<a href="https://colab.research.google.com/github/Kinnaruo/MachineLearning/blob/main/Week%203/Regression_Infrared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import pandas as pd
import numpy as np
url = 'https://raw.githubusercontent.com/farrelrassya/teachingMLDL/refs/heads/main/02.%20Deep%20Learning/Dataset/Infrared.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance,T_offset1,Max1R13_1,Max1L13_1,aveAllR13_1,...,T_FHRC1,T_FHLC1,T_FHBC1,T_FHTC1,T_FH_Max1,T_FHC_Max1,T_Max1,T_OR1,T_OR_Max1,aveOralM
0,Male,41-50,White,24.0,28.0,0.8,0.7025,35.03,35.3775,34.4,...,33.4775,33.3725,33.4925,33.0025,34.53,34.0075,35.6925,35.635,35.6525,36.59
1,Female,31-40,Black or African-American,24.0,26.0,0.8,0.78,34.55,34.52,33.93,...,34.055,33.6775,33.97,34.0025,34.6825,34.66,35.175,35.0925,35.1075,37.19
2,Female,21-30,White,24.0,26.0,0.8,0.8625,35.6525,35.5175,34.2775,...,34.8275,34.6475,34.82,34.67,35.345,35.2225,35.9125,35.86,35.885,37.34
3,Female,21-30,Black or African-American,24.0,27.0,0.8,0.93,35.2225,35.6125,34.385,...,34.4225,34.655,34.3025,34.9175,35.6025,35.315,35.72,34.965,34.9825,37.09
4,Male,18-20,White,24.0,27.0,0.8,0.895,35.545,35.665,34.91,...,35.16,34.3975,34.67,33.8275,35.4175,35.3725,35.895,35.5875,35.6175,37.04


In [None]:
df.dropna()
X = df.drop(columns="aveOralM")
y = df["aveOralM"]

In [None]:
numeric_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ]
)

In [None]:
# Pipeline Model KNN
knn_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", KNeighborsRegressor(n_neighbors=5))
])

# Pipeline Model Decision Tree
dt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn_pipeline.fit(X_train, y_train)
knn_preds = knn_pipeline.predict(X_test)

dt_pipeline.fit(X_train, y_train)
dt_preds = dt_pipeline.predict(X_test)

In [None]:
def evaluate(y_true, y_pred, name):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name}:\n  MSE  = {mse:.4f}\n  RMSE = {rmse:.4f}\n  R²    = {r2:.4f}")

evaluate(y_test, knn_preds, "K-Nearest Neighbors")
evaluate(y_test, dt_preds, "Decision Tree")


K-Nearest Neighbors:
  MSE  = 0.0907
  RMSE = 0.3012
  R²    = 0.5692

Decision Tree:
  MSE  = 0.1154
  RMSE = 0.3397
  R²    = 0.4521


##**Mean Squared Error (MSE):**
MSE mengukur rata-rata dari kuadrat selisih antara nilai aktual $y$ dan nilai prediksi $\hat{y}$. Semakin mendekati 0, berarti semakin kecil error yang dibuat model.

  $$
  MSE = \frac{1}{n} \sum (y_i - \hat{y}_i)^2
  $$

Di mana:  
- $y_i$ = Nilai aktual (data sebenarnya)  
- $\hat{y}_i$ = Nilai prediksi dari model  
- $n$ = Jumlah sampel  

##**Root Mean Squared Error (RMSE):**
RMSE adalah akar kuadrat dari MSE, sehingga unitnya sama dengan unit target. Sama seperti MSE, semakin mendekati 0, semakin bagus.
  $$
  RMSE = \sqrt{MSE}
  $$

##**R-squared (\( R^2 \)):**
$R^2$ mengukur seberapa baik model menjelaskan variabilitas data. Nilainya antara **0 dan 1**, semakin mendekati **1** semakin bagus. Jika nilainya lebih kecil dari 0, maka model lebih buruk daripada rata-rata.

  $$
  R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}
  $$

In [14]:
import shutil
shutil.copyfile(__file__, "/content/Regression Infrared.ipynb")

NameError: name '__file__' is not defined