# Model KNN untuk Prediksi Kualitas Udara

## Deskripsi
Model machine learning menggunakan K-Nearest Neighbors untuk memprediksi kualitas udara berdasarkan 6 parameter:
- PM10, PM2.5, SO2, CO, O3, NO2

## Target Klasifikasi
- **Baik**: Kualitas udara sehat
- **Sedang**: Kualitas udara dapat diterima  
- **Buruk**: Kualitas udara tidak sehat

In [None]:
# Import semua library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: matplotlib's seaborn-v0_8 style sheet, the
# "husl" seaborn palette, and a 10x6 default figure size.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Load dan Eksplorasi Data

In [None]:
# Load the dataset - replace the path with your own file if needed
try:
    # Sample data expected next to this notebook
    df = pd.read_csv('sample_air_quality_data.csv')
except FileNotFoundError:
    print("❌ File dataset tidak ditemukan!")
    print("Pastikan file CSV ada di folder yang sama dengan notebook ini")
    print("Format yang diharapkan: PM10, PM2.5, SO2, CO, O3, NO2, Quality")
    # Re-raise: without `df` the statements below (and every later cell)
    # would only fail with a confusing NameError.
    raise
else:
    print("✅ Dataset berhasil dimuat!")

# Basic overview of the loaded dataset
print(f"\n📊 Shape dataset: {df.shape}")
print(f"📋 Kolom: {list(df.columns)}")
print("\n🔍 5 baris pertama:")
# Last expression of the cell so the notebook can render the preview
df.head()

In [None]:
# Inspect the dataset structure via DataFrame.info()
print("📈 Info Dataset:")
df.info()

# Descriptive statistics; kept as the last expression of the cell so the
# notebook can render the resulting table.
print("\n📊 Statistik Deskriptif:")
df.describe()

In [None]:
# Report missing values per column
print("🔍 Missing Values:")
missing_data = df.isnull().sum()
total_missing = missing_data.sum()
if total_missing > 0:
    print(missing_data[missing_data > 0])
    print(f"\n⚠️  Total missing values: {total_missing}")
else:
    print("✅ Tidak ada missing values!")

# Distribution of the target variable
print("\n🎯 Distribusi Kualitas Udara:")
quality_counts = df['Quality'].value_counts()
print(quality_counts)

# value_counts() orders categories by frequency, so colours must be looked
# up per category label; a fixed colour list would paint whichever class is
# most frequent green. Labels outside the known spellings fall back to gray.
quality_colors = {
    'baik': 'green', 'good': 'green',
    'sedang': 'orange', 'moderate': 'orange',
    'buruk': 'red', 'bad': 'red', 'poor': 'red',
}
bar_colors = [
    quality_colors.get(str(category).strip().lower(), 'gray')
    for category in quality_counts.index
]

# Bar chart of the target distribution
plt.figure(figsize=(8, 5))
quality_counts.plot(kind='bar', color=bar_colors)
plt.title('Distribusi Kualitas Udara')
plt.xlabel('Kategori Kualitas')
plt.ylabel('Jumlah Data')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## 2. Data Preprocessing

In [None]:
# Work on a copy so the raw dataset stays untouched
df_processed = df.copy()

# Handle missing values, if there are any
if df_processed.isnull().sum().sum() > 0:
    print("🔧 Menangani missing values...")
    # Numeric features: fill gaps with the column median
    numeric_columns = ['PM10', 'PM2.5', 'SO2', 'CO', 'O3', 'NO2']
    for col in numeric_columns:
        if df_processed[col].isnull().sum() > 0:
            median_val = df_processed[col].median()
            # Assign the result back: calling fillna(inplace=True) on the
            # `df_processed[col]` selection is chained assignment, which
            # does not update the frame under pandas Copy-on-Write.
            df_processed[col] = df_processed[col].fillna(median_val)
            print(f"   - {col}: filled dengan median = {median_val:.2f}")

    # Rows without a target label cannot be used for supervised training,
    # and a NaN mixed into the string labels would break label encoding.
    missing_target = df_processed['Quality'].isnull().sum()
    if missing_target > 0:
        df_processed = df_processed.dropna(subset=['Quality'])
        print(f"   - Quality: {missing_target} baris tanpa label dihapus")

# Validate the target column's categories
expected_categories = ['Baik', 'Sedang', 'Buruk']
unique_categories = df_processed['Quality'].unique()

print(f"\n🏷️  Kategori yang ditemukan: {list(unique_categories)}")

# Known English spellings mapped onto the expected Indonesian categories
category_mapping = {
    'good': 'Baik', 'Good': 'Baik', 'GOOD': 'Baik',
    'moderate': 'Sedang', 'Moderate': 'Sedang', 'MODERATE': 'Sedang',
    'bad': 'Buruk', 'Bad': 'Buruk', 'BAD': 'Buruk',
    'poor': 'Buruk', 'Poor': 'Buruk', 'POOR': 'Buruk'
}

# Only rewrite the column when something other than the expected labels
# shows up; values missing from the mapping are left unchanged by replace().
if not all(cat in expected_categories for cat in unique_categories):
    print("🔄 Melakukan standardisasi kategori...")
    df_processed['Quality'] = df_processed['Quality'].replace(category_mapping)
    print(f"   Kategori setelah standardisasi: {list(df_processed['Quality'].unique())}")

print("\n✅ Preprocessing selesai!")

## 3. Eksplorasi Data Visual

In [None]:
# Pairwise correlation between the six pollutant measurements
numeric_features = ['PM10', 'PM2.5', 'SO2', 'CO', 'O3', 'NO2']
correlation_matrix = df_processed.loc[:, numeric_features].corr()

# Annotated heatmap, colour scale centred on zero correlation
heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, fmt='.2f')
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Korelasi Antar Parameter Kualitas Udara')
plt.tight_layout()
plt.show()

In [None]:
# One box plot per pollutant, grouped by air-quality category (2x3 grid)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

for panel, feature in zip(axes, numeric_features):
    sns.boxplot(x='Quality', y=feature, data=df_processed, ax=panel)
    panel.set_title(f'Distribusi {feature} per Kualitas Udara')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Persiapan Data untuk Model

In [None]:
# Feature matrix and target column
X = df_processed[numeric_features]
y = df_processed['Quality']

print(f"📐 Shape features (X): {X.shape}")
print(f"🎯 Shape target (y): {y.shape}")

# Turn the string labels into integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Keep the name -> code mapping around for interpreting results
encoded_classes = label_encoder.transform(label_encoder.classes_)
label_mapping = {
    name: code for name, code in zip(label_encoder.classes_, encoded_classes)
}
print(f"\n🏷️  Label mapping: {label_mapping}")

# Stratified 80/20 train/test split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"\n📊 Data Training: {X_train.shape[0]} samples")
print(f"📊 Data Testing: {X_test.shape[0]} samples")

In [None]:
# Standardise the features. The scaler is fit on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Before/after summary of the training features
mean_before = X_train.mean().round(2).to_dict()
mean_after = X_train_scaled.mean(axis=0).round(2)
std_after = X_train_scaled.std(axis=0).round(2)

print("⚖️  Feature scaling selesai!")
print(f"   - Mean sebelum scaling: {mean_before}")
print(f"   - Mean setelah scaling: {mean_after}")
print(f"   - Std setelah scaling: {std_after}")

## 5. Training Model KNN

In [None]:
# Search for the best K with 5-fold cross-validation.
# NOTE(review): X_train_scaled was scaled with statistics from the whole
# training split, so the per-fold scores are slightly optimistic.
n_cv_folds = 5

# K cannot exceed the number of samples a fold is trained on. With a too
# large K the fold score becomes NaN (the warning is suppressed above),
# so cap the search range at roughly the smallest training-fold size.
n_train = len(X_train_scaled)
smallest_train_fold = n_train - int(np.ceil(n_train / n_cv_folds))
k_range = range(1, min(20, smallest_train_fold) + 1)
cv_scores = []

print("🔍 Mencari nilai K optimal...")
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=n_cv_folds, scoring='accuracy')
    cv_scores.append(scores.mean())
    if k % 5 == 0:
        print(f"   K={k}: CV Score = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# NaN-aware selection: np.argmax would treat a NaN score as the maximum
# and pick a K that failed during cross-validation.
optimal_k = k_range[int(np.nanargmax(cv_scores))]
best_cv_score = np.nanmax(cv_scores)

# Plot the cross-validation curve and mark the chosen K
plt.figure(figsize=(10, 6))
plt.plot(k_range, cv_scores, marker='o')
plt.xlabel('Nilai K')
plt.ylabel('Cross-Validation Accuracy')
plt.title('KNN: Pencarian Nilai K Optimal')
plt.grid(True, alpha=0.3)
plt.axvline(x=optimal_k, color='red', linestyle='--', label=f'Optimal K = {optimal_k}')
plt.legend()
plt.show()

print(f"\n🎯 Nilai K optimal: {optimal_k}")
print(f"📈 CV Score terbaik: {best_cv_score:.4f}")

In [None]:
# Fit the final classifier on the scaled training data using the K that
# was selected by cross-validation.
best_knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train_scaled, y_train)

print(f"✅ Model KNN berhasil dilatih dengan K = {optimal_k}")

## 6. Evaluasi Model

In [None]:
# Predict on the held-out test split
y_pred = best_knn.predict(X_test_scaled)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Akurasi Model: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class report. The label codes are passed explicitly so the report
# always has one row per encoded class; without `labels`, a class that is
# absent from both y_test and y_pred makes the number of classes differ
# from len(target_names) and classification_report raises.
target_names = label_encoder.classes_
class_codes = np.arange(len(target_names))
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred, labels=class_codes, target_names=target_names))

In [None]:
# Confusion matrix with one row/column per encoded class. Pinning `labels`
# keeps the matrix shape equal to len(target_names) even when a class is
# missing from the test split, so tick labels and the loop below stay
# aligned with the right class.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(target_names)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.tight_layout()
plt.show()

# Plain-text reading of every non-empty cell: rows are the actual class,
# columns the predicted class.
print("\n🔍 Interpretasi Confusion Matrix:")
for i, actual_class in enumerate(target_names):
    for j, pred_class in enumerate(target_names):
        count = cm[i, j]
        if count > 0:
            if i == j:
                print(f"   ✅ {count} sampel '{actual_class}' diprediksi benar")
            else:
                print(f"   ❌ {count} sampel '{actual_class}' salah diprediksi sebagai '{pred_class}'")

## 7. Fungsi Prediksi untuk Data Baru

In [None]:
def predict_air_quality(pm10, pm25, so2, co, o3, no2):
    """
    Predict the air-quality category for one set of pollutant readings.

    Relies on the module-level ``scaler``, ``best_knn`` and
    ``label_encoder`` fitted earlier in the notebook. Prints the inputs,
    the predicted category and the per-class probabilities.

    Parameters:
    pm10, pm25, so2, co, o3, no2: pollutant readings, passed to the scaler
        in this order (PM10, PM2.5, SO2, CO, O3, NO2)

    Returns:
    str: predicted category as decoded by ``label_encoder``
        (e.g. 'Baik', 'Sedang', 'Buruk')
    """
    readings = [[pm10, pm25, so2, co, o3, no2]]

    # If the scaler was fit on a DataFrame, hand it a frame with the same
    # column names; otherwise fall back to a plain array.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        input_data = pd.DataFrame(readings, columns=feature_names)
    else:
        input_data = np.array(readings)

    # Scale the input with the already-fitted scaler
    input_scaled = scaler.transform(input_data)

    # Predicted class code and per-class probabilities
    prediction = best_knn.predict(input_scaled)
    prediction_proba = best_knn.predict_proba(input_scaled)

    # Decode the class code back to its string label
    predicted_class = label_encoder.inverse_transform(prediction)[0]

    # predict_proba columns follow best_knn.classes_, so derive the display
    # names from the model itself instead of a global defined in another cell.
    class_names = label_encoder.inverse_transform(best_knn.classes_)

    # Report
    print(f"\n🔬 Input Parameter:")
    print(f"   PM10: {pm10}, PM2.5: {pm25}, SO2: {so2}")
    print(f"   CO: {co}, O3: {o3}, NO2: {no2}")
    print(f"\n🎯 Prediksi: {predicted_class}")
    print(f"📊 Confidence:")
    for class_name, probability in zip(class_names, prediction_proba[0]):
        confidence = probability * 100
        print(f"   {class_name}: {confidence:.1f}%")

    return predicted_class

# Try the prediction helper on two hand-picked examples
divider = "=" * 50
print("🧪 Test Prediksi dengan Data Contoh:")
print(divider)

# Example 1: low pollutant readings
print("📋 Contoh 1 - Data dengan polusi rendah:")
low_pollution = dict(pm10=30, pm25=15, so2=10, co=5, o3=50, no2=20)
result1 = predict_air_quality(**low_pollution)

print("\n" + divider)

# Example 2: high pollutant readings
print("📋 Contoh 2 - Data dengan polusi tinggi:")
high_pollution = dict(pm10=150, pm25=80, so2=100, co=50, o3=200, no2=100)
result2 = predict_air_quality(**high_pollution)

## 8. Menyimpan Model

In [None]:
import joblib

# Persist the classifier, the scaler and the label encoder, one file each
artifacts = {
    'knn_air_quality_model.pkl': best_knn,
    'feature_scaler.pkl': scaler,
    'label_encoder.pkl': label_encoder,
}
for file_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, file_name)

print("💾 Model berhasil disimpan!")
for file_name in artifacts:
    print(f"   - {file_name}")

# Round-trip check: load the three files back from disk
print("\n🔄 Test load model:")
loaded_model = joblib.load('knn_air_quality_model.pkl')
loaded_scaler = joblib.load('feature_scaler.pkl')
loaded_encoder = joblib.load('label_encoder.pkl')
print("✅ Model berhasil di-load ulang!")

## 9. Ringkasan Hasil

In [None]:
# Final summary of the dataset, the chosen model and its scores
print("\n" + "="*60)
print("📈 RINGKASAN HASIL MODEL KNN PREDIKSI KUALITAS UDARA")
print("="*60)

# Counts come from the feature matrix X: df.shape[1] also includes the
# 'Quality' target column and would over-report the feature count by one.
print(f"📊 Dataset: {X.shape[0]} sampel, {X.shape[1]} fitur")
print(f"🎯 Target: 3 kategori (Baik, Sedang, Buruk)")
print(f"⚙️  Parameter: PM10, PM2.5, SO2, CO, O3, NO2")
print(f"🔧 Algoritma: K-Nearest Neighbors")
print(f"📐 K Optimal: {optimal_k}")
print(f"🎯 Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"📋 CV Score: {max(cv_scores):.4f}")

# Usage instructions for the saved files
print("\n📝 Cara Menggunakan Model:")
print("1. Load model: joblib.load('knn_air_quality_model.pkl')")
print("2. Load scaler: joblib.load('feature_scaler.pkl')")
print("3. Load encoder: joblib.load('label_encoder.pkl')")
print("4. Scale input data dengan scaler")
print("5. Prediksi dengan model")
print("6. Convert hasil dengan encoder")

print("\n✅ Model siap digunakan untuk prediksi kualitas udara!")
print("="*60)