# New Section

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset from GitHub raw URL
data_url = (
    'https://raw.githubusercontent.com/farrelrassya/teachingMLDL/'
    'main/02.%20Deep%20Learning/Dataset/Infrared.csv'
)
df = pd.read_csv(data_url)

# Display first rows
df.head()


Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance,T_offset1,Max1R13_1,Max1L13_1,aveAllR13_1,...,T_FHRC1,T_FHLC1,T_FHBC1,T_FHTC1,T_FH_Max1,T_FHC_Max1,T_Max1,T_OR1,T_OR_Max1,aveOralM
0,Male,41-50,White,24.0,28.0,0.8,0.7025,35.03,35.3775,34.4,...,33.4775,33.3725,33.4925,33.0025,34.53,34.0075,35.6925,35.635,35.6525,36.59
1,Female,31-40,Black or African-American,24.0,26.0,0.8,0.78,34.55,34.52,33.93,...,34.055,33.6775,33.97,34.0025,34.6825,34.66,35.175,35.0925,35.1075,37.19
2,Female,21-30,White,24.0,26.0,0.8,0.8625,35.6525,35.5175,34.2775,...,34.8275,34.6475,34.82,34.67,35.345,35.2225,35.9125,35.86,35.885,37.34
3,Female,21-30,Black or African-American,24.0,27.0,0.8,0.93,35.2225,35.6125,34.385,...,34.4225,34.655,34.3025,34.9175,35.6025,35.315,35.72,34.965,34.9825,37.09
4,Male,18-20,White,24.0,27.0,0.8,0.895,35.545,35.665,34.91,...,35.16,34.3975,34.67,33.8275,35.4175,35.3725,35.895,35.5875,35.6175,37.04


# Setup-setup

In [10]:
# Check shape and missing values
print("Shape:", df.shape)
print("Missing values per column:\n", df.isna().sum())

# Basic statistics
df.describe()
# Separate features and target
X = df.drop(columns=['aveOralM'])
y = df['aveOralM']

# Encode categorical variables (if any) using one-hot encoding
df_encoded = pd.get_dummies(X, drop_first=True)

# Update X to the encoded DataFrame
X = df_encoded

# Impute missing values using mean strategy
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
# Fit imputer and transform, keeping column names
df_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X = df_imputed

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features for K-NN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Shape: (1020, 34)
Missing values per column:
 Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       2
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
aveOralM       0
dtype: int64





# Setup K-NN dan Decision Tree

In [12]:
# Initialize and train K-NN regressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict on test set
y_pred_knn = knn.predict(X_test_scaled)
# Initialize and train Decision Tree regressor
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Predict on test set
y_pred_dt = dt.predict(X_test)

# Evaluasi

In [13]:

# Define a function to compute metrics
def evaluate_model(y_true, y_pred, name="Model"):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"{name} Performance:")
    print(f"  MSE:  {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R2:   {r2:.4f}\n")

# Evaluate K-NN
evaluate_model(y_test, y_pred_knn, "K-NN Regressor")

# Evaluate Decision Tree
evaluate_model(y_test, y_pred_dt, "Decision Tree Regressor")

K-NN Regressor Performance:
  MSE:  0.0727
  RMSE: 0.2697
  R2:   0.6547

Decision Tree Regressor Performance:
  MSE:  0.1264
  RMSE: 0.3556
  R2:   0.3996

