# Machine Learning Sederhana dengan Data Outlier

Notebook ini mendemonstrasikan:
1. Membuat dataset dengan outlier
2. Visualisasi dan deteksi outlier
3. Deteksi outlier (IQR, Z-Score) dan penanganannya (IQR)
4. Perbandingan model ML sebelum dan sesudah penanganan outlier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import stats

# Seed NumPy's global RNG so the random draws in this notebook are repeatable.
np.random.seed(42)
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Fall back to the pre-3.6 name so older installations do not raise OSError.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
print('Libraries loaded successfully!')

## 1. Membuat Dataset dengan Outlier

In [None]:
# Build a synthetic housing dataset and then corrupt a few rows with outliers.
# All draws use NumPy's global RNG, so the values depend on the statement
# order below; do not reorder the np.random calls.
n_samples = 200

# Features
luas_rumah = np.random.normal(100, 30, n_samples)       # House area (m2)
jumlah_kamar = np.random.randint(1, 6, n_samples)       # Number of rooms (integers 1-5; the upper bound is exclusive)
jarak_pusat_kota = np.random.normal(10, 5, n_samples)   # Distance to the city centre (km)

# Target: house price (million rupiah) - linear in the features plus Gaussian noise
harga = 200 + 5 * luas_rumah + 50 * jumlah_kamar - 10 * jarak_pusat_kota + np.random.normal(0, 50, n_samples)

# Inject OUTLIERS manually: pick 15 distinct row positions (replace=False)
# and split them 5/5/5 between area, price and distance.
n_outliers = 15
outlier_idx = np.random.choice(n_samples, n_outliers, replace=False)

# Outliers in house area (very large / very small).
# `harga` was already computed above, so the price of these rows still
# reflects the original area and no longer matches the overwritten value.
luas_rumah[outlier_idx[:5]] = np.random.choice([300, 350, 400, 10, 5], 5)

# Outliers in price (implausible prices)
harga[outlier_idx[5:10]] = np.random.choice([3000, 3500, 4000, 50, 30], 5)

# Outliers in distance to the city centre (includes negative distances)
jarak_pusat_kota[outlier_idx[10:]] = np.random.choice([50, 60, 70, -5, -3], 5)

# Assemble the DataFrame: three feature columns followed by the target
df = pd.DataFrame({
    'luas_rumah': luas_rumah,
    'jumlah_kamar': jumlah_kamar,
    'jarak_pusat_kota': jarak_pusat_kota,
    'harga': harga
})

print(f'Shape dataset: {df.shape}')
print(f'Jumlah outlier yang ditambahkan: {n_outliers}')
# Last expression of the cell: the notebook renders the first 10 rows.
df.head(10)

In [None]:
# Summary statistics per column; the extreme min/max values hint at the injected outliers.
df.describe()

## 2. Visualisasi Data & Deteksi Outlier

In [None]:
# One boxplot panel per DataFrame column, laid out side by side
fig, axes = plt.subplots(1, 4, figsize=(18, 5))

for position, column_name in enumerate(df.columns):
    panel = axes[position]
    panel.boxplot(df[column_name], vert=True)
    panel.set_ylabel('Nilai')
    panel.set_title(f'Boxplot: {column_name}', fontsize=12)

plt.suptitle('Deteksi Outlier dengan Boxplot', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots: each feature against price, one panel per feature
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (feature column, marker colour, x-axis label, panel title)
panel_specs = [
    ('luas_rumah', 'steelblue', 'Luas Rumah (m²)', 'Luas Rumah vs Harga'),
    ('jumlah_kamar', 'coral', 'Jumlah Kamar', 'Jumlah Kamar vs Harga'),
    ('jarak_pusat_kota', 'mediumseagreen', 'Jarak ke Pusat Kota (km)', 'Jarak Pusat Kota vs Harga'),
]

for panel, (feature, colour, x_label, title) in zip(axes, panel_specs):
    panel.scatter(df[feature], df['harga'], alpha=0.6, c=colour, edgecolors='k', linewidth=0.5)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Harga (Juta Rp)')
    panel.set_title(title)

plt.suptitle('Scatter Plot - Terlihat Outlier', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Deteksi Outlier dengan IQR dan Z-Score

In [None]:
# === Method 1: IQR (Interquartile Range) ===
def detect_outliers_iqr(df, column):
    """Find the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; values exactly
    on a fence are not flagged.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers``
        is the subset of ``df`` strictly outside the fences.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - 1.5 * spread
    upper_bound = q3 + 1.5 * spread

    too_low = values.lt(lower_bound)
    too_high = values.gt(upper_bound)
    return df[too_low | too_high], lower_bound, upper_bound

print('=== Deteksi Outlier dengan IQR ===')
# Report the outlier count and the fences for every column
for col in df.columns:
    flagged, lb, ub = detect_outliers_iqr(df, col)
    n_flagged = len(flagged)
    print(f'{col}: {n_flagged} outlier terdeteksi (batas: [{lb:.1f}, {ub:.1f}])')

In [None]:
# === Method 2: Z-Score ===
def detect_outliers_zscore(df, column, threshold=3):
    """Flag rows whose absolute z-score in ``column`` exceeds ``threshold``.

    Returns:
        A tuple ``(outliers, z_scores)``: the flagged rows of ``df`` and the
        absolute z-score of every value in the column, as computed by
        ``scipy.stats.zscore``.
    """
    abs_z = np.abs(stats.zscore(df[column]))
    is_extreme = abs_z > threshold
    return df[is_extreme], abs_z

print('=== Deteksi Outlier dengan Z-Score (threshold=3) ===')
# Report the outlier count for every column; the z-scores themselves are not needed here
for col in df.columns:
    flagged = detect_outliers_zscore(df, col)[0]
    n_flagged = len(flagged)
    print(f'{col}: {n_flagged} outlier terdeteksi')

## 4. Penanganan Outlier - Metode IQR

In [None]:
def remove_outliers_iqr(df):
    """Return a copy of ``df`` without rows that are IQR outliers in any column.

    For every column the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed once, from the full input. A row is kept only when each of its
    values lies within its column's fences (inclusive). Rows containing NaN
    are dropped, because NaN fails both comparisons.

    All fences are computed before any row is removed, so the result does not
    depend on column order and uses the same fences that
    ``detect_outliers_iqr`` reports. Filtering column by column on the
    already-reduced data would tighten later columns' fences and remove rows
    that are not outliers with respect to the original data.

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels. The input is not modified.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # The fence Series are indexed by column label; axis='columns' matches
    # each fence with its own column.
    inside = df.ge(lower, axis='columns') & df.le(upper, axis='columns')
    return df[inside.all(axis=1)].copy()

df_clean = remove_outliers_iqr(df)

# Report how many rows the IQR filter removed
rows_before, rows_after = len(df), len(df_clean)
print(f'Data asli     : {rows_before} baris')
print(f'Data bersih   : {rows_after} baris')
print(f'Data dihapus  : {rows_before - rows_after} baris')

In [None]:
# Before/after comparison: top row = original data, bottom row = cleaned data
fig, axes = plt.subplots(2, 4, figsize=(18, 10))

stages = [(df, 'Sebelum'), (df_clean, 'Sesudah')]
for row, (frame, stage) in enumerate(stages):
    for idx, col in enumerate(df.columns):
        panel = axes[row, idx]
        panel.boxplot(frame[col])
        panel.set_title(f'{col}\n({stage})', fontsize=11)

plt.suptitle('Perbandingan Boxplot: Sebelum vs Sesudah Penanganan Outlier', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Training Model ML - Perbandingan Sebelum & Sesudah

In [None]:
def train_and_evaluate(dataframe, label=''):
    """Fit a Linear Regression and a Random Forest on one 80/20 split and score both.

    Args:
        dataframe: Must contain the feature columns 'luas_rumah',
            'jumlah_kamar' and 'jarak_pusat_kota' and the target column
            'harga'.
        label: Accepted for call-site readability; not used by the function.

    Returns:
        A dict keyed by model name. Each value is a dict with the held-out
        'R2', 'RMSE' and 'MAE' scores plus the 'y_test' targets and the
        'y_pred' predictions they were computed from.
    """
    feature_columns = ['luas_rumah', 'jumlah_kamar', 'jarak_pusat_kota']
    features = dataframe[feature_columns]
    target = dataframe['harga']

    # Fixed random_state: both models are scored on the same held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    def _score(estimator):
        # Fit on the training split, then score on the held-out split.
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        return {
            'R2': r2_score(y_test, predicted),
            'RMSE': np.sqrt(mean_squared_error(y_test, predicted)),
            'MAE': mean_absolute_error(y_test, predicted),
            'y_test': y_test,
            'y_pred': predicted,
        }

    candidates = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }
    return {name: _score(estimator) for name, estimator in candidates.items()}

In [None]:
# Train on the ORIGINAL data (outliers included)
results_with_outlier = train_and_evaluate(df, 'Dengan Outlier')

# Train on the CLEANED data (outliers removed)
results_without_outlier = train_and_evaluate(df_clean, 'Tanpa Outlier')

# Print a fixed-width comparison table
heavy_rule = '=' * 70
light_rule = '-' * 70
print(heavy_rule)
print(f'{"Model":<22} {"Kondisi":<18} {"R²":>8} {"RMSE":>10} {"MAE":>10}')
print(heavy_rule)

for model_name in ['Linear Regression', 'Random Forest']:
    # The model name is shown on the first of its two rows only.
    table_rows = (
        (model_name, 'Dengan Outlier', results_with_outlier[model_name]),
        ('', 'Tanpa Outlier', results_without_outlier[model_name]),
    )
    for first_cell, kondisi, metrics in table_rows:
        print(f'{first_cell:<22} {kondisi:<18} {metrics["R2"]:>8.4f} {metrics["RMSE"]:>10.2f} {metrics["MAE"]:>10.2f}')
    print(light_rule)

In [None]:
# Actual-vs-predicted grid: rows = data condition, columns = model
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

condition_rows = [
    (results_with_outlier, 'Dengan Outlier'),
    (results_without_outlier, 'Tanpa Outlier'),
]
model_columns = [('Linear Regression', 'steelblue'), ('Random Forest', 'coral')]

configs = [
    (row, col, results, model_name, condition, color)
    for row, (results, condition) in enumerate(condition_rows)
    for col, (model_name, color) in enumerate(model_columns)
]

for row, col, results, model_name, condition, color in configs:
    ax = axes[row, col]
    r = results[model_name]
    actual, predicted = r['y_test'], r['y_pred']

    ax.scatter(actual, predicted, alpha=0.6, c=color, edgecolors='k', linewidth=0.5)

    # Diagonal reference line: points on it are perfect predictions
    low = min(actual.min(), predicted.min())
    high = max(actual.max(), predicted.max())
    ax.plot([low, high], [low, high], 'r--', linewidth=2, label='Perfect')

    ax.set_title(f'{model_name}\n({condition}) | R²={r["R2"]:.4f}', fontsize=11)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.legend()

plt.suptitle('Actual vs Predicted: Pengaruh Outlier pada Model', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart comparing the R² score with vs. without outliers per model
models = ['Linear Regression', 'Random Forest']
r2_with = [results_with_outlier[m]['R2'] for m in models]
r2_without = [results_without_outlier[m]['R2'] for m in models]

x = np.arange(len(models))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, r2_with, width, label='Dengan Outlier', color='salmon', edgecolor='black')
bars2 = ax.bar(x + width/2, r2_without, width, label='Tanpa Outlier', color='mediumseagreen', edgecolor='black')

ax.set_ylabel('R² Score', fontsize=12)
ax.set_title('Perbandingan R² Score: Dengan vs Tanpa Outlier', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models, fontsize=12)
ax.legend(fontsize=11)

# R² is at most 1 but can be negative when a model does worse than predicting
# the mean. Extend the lower limit in that case so the bar is not clipped at 0.
lowest_r2 = min(r2_with + r2_without)
ax.set_ylim(lowest_r2 - 0.1 if lowest_r2 < 0 else 0, 1.1)

# Value labels: just above positive bars, just below negative ones
for bar in (*bars1, *bars2):
    height = bar.get_height()
    if height >= 0:
        label_y, v_align = height + 0.02, 'baseline'
    else:
        label_y, v_align = height - 0.02, 'top'
    ax.text(bar.get_x() + bar.get_width()/2., label_y,
            f'{height:.3f}', ha='center', va=v_align, fontsize=11)

plt.tight_layout()
plt.show()

## 6. Kesimpulan

Dari eksperimen ini dapat disimpulkan:

1. **Outlier memengaruhi performa model** — terutama Linear Regression yang sangat sensitif terhadap outlier karena meminimalkan sum of squared errors.
2. **Random Forest lebih robust** terhadap outlier dibanding Linear Regression karena menggunakan ensemble dari decision trees.
3. **Penanganan outlier dengan IQR** berhasil meningkatkan R² dan menurunkan RMSE/MAE pada kedua model.
4. **Selalu lakukan EDA dan deteksi outlier** sebelum training model untuk hasil yang lebih akurat.