<a href="https://colab.research.google.com/github/Jammyeong/MachineLearningClass/blob/main/4thWeek/Tugas_ML_4_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [19]:
# Memuat dataset training
df = pd.read_csv('/content/drive/MyDrive/smt akhir/ml/train.csv')

In [20]:
# Menghapus fitur yang tidak relevan
df.drop(columns=["Id"], inplace=True)

# Encoding fitur kategorikal
categorical_features = df.select_dtypes(include=["object"]).columns

for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [21]:
# Normalisasi fitur numerik
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns
numerical_features = numerical_features.drop("SalePrice")  # Target tidak dinormalisasi

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Membagi dataset menjadi training dan testing
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menampilkan ukuran dataset setelah split
print("Ukuran Training Set:", X_train.shape)
print("Ukuran Test Set:", X_test.shape)

Ukuran Training Set: (1168, 79)
Ukuran Test Set: (292, 79)


In [22]:
# Imputasi Missing Values
for col in X_train.columns:
    if X_train[col].dtype == "object":
        mode_val = X_train[col].mode()[0]
        X_train[col] = X_train[col].fillna(mode_val)
        X_test[col] = X_test[col].fillna(mode_val)
    else:
        med_val = X_train[col].median()
        X_train[col] = X_train[col].fillna(med_val)
        X_test[col] = X_test[col].fillna(med_val)

# Periksa apakah masih ada NaN setelah imputasi
print("Missing values setelah imputasi:", X_train.isnull().sum().sum(), X_test.isnull().sum().sum())

# Jika outputnya 0, lanjutkan ke langkah berikutnya:
model = LinearRegression()
model.fit(X_train, y_train)

Missing values setelah imputasi: 0 0


In [23]:
# Memuat dataset test
df_test = pd.read_csv('/content/drive/MyDrive/smt akhir/ml/test.csv')

In [24]:
# Pastikan test dataset memiliki fitur yang sama dengan training set
df_test.drop(columns=["Id"], inplace=True)

# Encoding fitur kategorikal
categorical_features = df_test.select_dtypes(include=["object"]).columns
for col in categorical_features:
    le = LabelEncoder()
    df_test[col] = le.fit_transform(df_test[col])

In [25]:
# Normalisasi fitur numerik
numerical_features = df_test.select_dtypes(include=["int64", "float64"]).columns
scaler = StandardScaler()
df_test[numerical_features] = scaler.fit_transform(df_test[numerical_features])

# Latih model menggunakan training set
model = LinearRegression()
model.fit(X_train, y_train)

# Prediksi pada training dan test dataset
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)  # Menggunakan X_test, bukan df_test karena df_test tidak memiliki label

In [26]:
# Evaluasi performa model pada training set
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

# Evaluasi performa model pada test set
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
r2_test = r2_score(y_test, y_test_pred)

# Training set
print("Training Set:")
print(f"MSE : {mse_train:.3f}")
print(f"RMSE: {rmse_train:.3f}")
print(f"R²  : {r2_train:.3f}\n")

# Test set
print("Test Set:")
print(f"MSE : {mse_test:.3f}")
print(f"RMSE: {rmse_test:.3f}")
print(f"R²  : {r2_test:.3f}\n")

# Deteksi Overfitting
if r2_train > r2_test:
    print("Model mengalami overfitting.")
else:
    print("Model tidak mengalami overfitting.")

Training Set:
MSE : 865095022.491
RMSE: 29412.498
R²  : 0.855

Test Set:
MSE : 1194633476.545
RMSE: 34563.470
R²  : 0.844

Model mengalami overfitting.


### 📐 Mean Squared Error (MSE)
MSE = (1 / n) * Σ (yᵢ - ŷᵢ)²

Keterangan:
- yᵢ  : nilai aktual ke-i
- ŷᵢ  : nilai prediksi ke-i
- n   : jumlah sampel
- Semakin kecil nilai MSE, semakin baik performa model.


### 📐 Root Mean Squared Error (RMSE)
RMSE = √( (1 / n) * Σ (yᵢ - ŷᵢ)² ) = √MSE

Keterangan:
- Mengembalikan satuan ke bentuk asli target.
- Semakin kecil nilai RMSE, semakin kecil kesalahan prediksi.


### 📐 R² Score (Koefisien Determinasi)
R² = 1 - (Σ (yᵢ - ŷᵢ)²) / (Σ (yᵢ - ȳ)²)

Keterangan:
- yᵢ  : nilai aktual ke-i
- ŷᵢ  : nilai prediksi ke-i
- ȳ   : rata-rata nilai aktual
- R² menunjukkan seberapa baik model menjelaskan variabilitas data.
- Nilai R² mendekati 1 → model sangat baik.
