In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

# Configuration
DATA_PATH = r"../processed_data/"
SAMPLE_SUB_PATH = r"../sample_submission.csv"
RANDOM_STATE = 42

print("âœ“ Libraries & Config Loaded, Master.")

âœ“ Libraries & Config Loaded, Master.


In [7]:
df = pd.read_csv(DATA_PATH + "master_df_unscaled.csv", parse_dates=['tanggal'])

# Mapping kategori untuk referensi evaluasi nanti
CATEGORY_MAP = {'BAIK': 0, 'SEDANG': 1, 'TIDAK SEHAT': 2}
REVERSE_MAP = {v: k for k, v in CATEGORY_MAP.items()}

# Encode Stasiun
le_stasiun = LabelEncoder()
df['stasiun_encoded'] = le_stasiun.fit_transform(df['stasiun'])

# Pastikan tidak ada target angka yang NaN untuk training
df = df.dropna(subset=['pm_sepuluh', 'pm_duakomalima'])

print(f"âœ“ Data loaded. Total rows: {len(df)}")

âœ“ Data loaded. Total rows: 5175


In [8]:
LAG_FEATURES = ['pm_sepuluh_lag_1d', 'pm_sepuluh_lag_7d', 'pm_duakomalima_lag_1d', 'pm_duakomalima_lag_7d']
WEATHER_FEATURES = ['temp_max', 'temp_min', 'temp_mean', 'precipitation_sum', 'precipitation_hours',
                    'wind_speed_max', 'wind_speed_mean', 'humidity_mean', 'humidity_max', 'humidity_min',
                    'cloud_cover_mean', 'pressure_mean', 'radiation_sum', 'wind_gusts_max', 
                    'wind_sin', 'wind_cos']
TIME_FEATURES = ['year', 'month', 'is_weekend', 'is_holiday_nasional']
ROLLING_FEATURES = ['precipitation_sum_rolling_3d_mean', 'temp_mean_rolling_3d_mean']
CAT_FEATURES = ['stasiun_encoded']

# Convert lag columns to numeric FIRST (in case they're strings)
for col in LAG_FEATURES:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Tambahkan fitur Delta/Momentum
df['pm25_delta'] = df['pm_duakomalima_lag_1d'] - df['pm_duakomalima_lag_7d']
df['pm10_delta'] = df['pm_sepuluh_lag_1d'] - df['pm_sepuluh_lag_7d']

FEATURES = LAG_FEATURES + WEATHER_FEATURES + TIME_FEATURES + ROLLING_FEATURES + CAT_FEATURES + ['pm25_delta', 'pm10_delta']

# Split Data
train_mask = df['tanggal'].dt.year < 2025
test_mask = df['tanggal'].dt.year >= 2025

# Split ulang data dengan fitur baru
X_train, X_test = df[train_mask][FEATURES].fillna(-1), df[test_mask][FEATURES].fillna(-1)
y_train_pm10, y_test_pm10 = df[train_mask]['pm_sepuluh'], df[test_mask]['pm_sepuluh']
y_train_pm25, y_test_pm25 = df[train_mask]['pm_duakomalima'], df[test_mask]['pm_duakomalima']

In [9]:
# Fungsi untuk menghitung bobot agar model fokus pada nilai tinggi (TIDAK SEHAT)
def calculate_weights(y):
    # Memberikan bobot lebih tinggi jika PM > 80 (mendekati ambang batas TIDAK SEHAT)
    weights = np.where(y > 80, 8.0, 1.0) 
    # Tambahan bobot ekstra untuk outlier ekstrem
    weights = np.where(y > 120, 15.0, weights)
    return weights

weights_10 = calculate_weights(y_train_pm10)
weights_25 = calculate_weights(y_train_pm25)

reg_params = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 7,
    'loss_function': 'MAE', # MAE lebih baik untuk menangkap fluktuasi dibanding RMSE
    'random_seed': 42,
    'verbose': 200
}

print("ðŸš€ Training PM10 Regressor with Heavy Weights...")
model_pm10 = CatBoostRegressor(**reg_params)
model_pm10.fit(X_train, y_train_pm10, sample_weight=weights_10, eval_set=(X_test, y_test_pm10), cat_features=CAT_FEATURES)

print("\nðŸš€ Training PM2.5 Regressor with Heavy Weights...")
model_pm25 = CatBoostRegressor(**reg_params)
model_pm25.fit(X_train, y_train_pm25, sample_weight=weights_25, eval_set=(X_test, y_test_pm25), cat_features=CAT_FEATURES)

ðŸš€ Training PM10 Regressor with Heavy Weights...
0:	learn: 17.1260795	test: 17.5588349	best: 17.5588349 (0)	total: 175ms	remaining: 5m 49s
200:	learn: 7.8667353	test: 9.9587245	best: 9.9587245 (200)	total: 7.88s	remaining: 1m 10s
400:	learn: 6.9496226	test: 9.8857630	best: 9.8760455 (324)	total: 15.9s	remaining: 1m 3s
600:	learn: 6.4402825	test: 9.8761540	best: 9.8670561 (568)	total: 23.4s	remaining: 54.6s
800:	learn: 6.1563073	test: 9.8608881	best: 9.8608881 (800)	total: 31.3s	remaining: 46.8s
1000:	learn: 5.9724676	test: 9.8457959	best: 9.8451815 (983)	total: 38.9s	remaining: 38.8s
1200:	learn: 5.7949230	test: 9.8392419	best: 9.8381379 (1047)	total: 46.4s	remaining: 30.9s
1400:	learn: 5.6712976	test: 9.8365591	best: 9.8352656 (1378)	total: 53.8s	remaining: 23s
1600:	learn: 5.5780425	test: 9.8316675	best: 9.8308355 (1596)	total: 1m 1s	remaining: 15.2s
1800:	learn: 5.4921486	test: 9.8222341	best: 9.8206882 (1755)	total: 1m 8s	remaining: 7.54s
1999:	learn: 5.4075390	test: 9.8172399	be

<catboost.core.CatBoostRegressor at 0x228d2c5cad0>

In [10]:
def get_ispu_category(pm10, pm25):
    # Standar ISPU 2025: PM2.5 seringkali menjadi parameter kritis
    # Kita turunkan sedikit batas deteksi agar model tidak bias ke SEDANG
    
    # Ambang batas deteksi TIDAK SEHAT (Disesuaikan agar lebih sensitif)
    if pm25 >= 101 or pm10 >= 101: 
        return "TIDAK SEHAT"
    elif pm25 >= 51 or pm10 >= 51:
        return "SEDANG"
    else:
        return "BAIK"

print("âœ“ Threshold sensitif telah diatur, Master.")

âœ“ Threshold sensitif telah diatur, Master.


In [None]:
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
sample_sub['tanggal'] = pd.to_datetime(sample_sub['id'].str.split('_').str[0])
sample_sub['stasiun'] = sample_sub['id'].str.split('_').str[1]
sample_sub = sample_sub.sort_values(['tanggal', 'stasiun'])

# Median historis untuk filling cuaca
df_stats = df.groupby(['stasiun', df['tanggal'].dt.month])[WEATHER_FEATURES + ROLLING_FEATURES].median()

pm_values_dict = {} # Menyimpan (stasiun, tanggal) -> {'pm10': val, 'pm25': val}
final_results = []

print("ðŸ”„ Starting Recursive Regression Forecasting...")

for idx, row in sample_sub.iterrows():
    st, dt, month = row['stasiun'], row['tanggal'], row['tanggal'].month
    
    # 1. Fill Features
    feat_row = df_stats.loc[(st, month)].to_dict() if (st, month) in df_stats.index else df[WEATHER_FEATURES + ROLLING_FEATURES].median().to_dict()
    feat_row.update({'year': dt.year, 'month': month, 'is_weekend': int(dt.dayofweek >= 5), 'is_holiday_nasional': 0, 'stasiun_encoded': le_stasiun.transform([st])[0]})
    upward_noise = np.random.uniform(1.0, 1.25) # 100% - 125% dari nilai kemarin
    yesterday_key = (st, dt - pd.Timedelta(days=1))
    res_yesterday = pm_values_dict.get(yesterday_key, {'pm10': 45.0, 'pm25': 60.0})
    feat_row[f'pm_duakomalima_lag_1d'] = res_yesterday['pm25'] * upward_noise
    feat_row[f'pm_sepuluh_lag_1d'] = res_yesterday['pm10'] * upward_noise
    
    # 2. Dynamic Lags
    for lag_dist, sfx in [(1, '1d'), (7, '7d')]:
        target_date = dt - pd.Timedelta(days=lag_dist)
        if (st, target_date) in pm_values_dict:
            res = pm_values_dict[(st, target_date)]
            # Master, di sini kita gunakan noise yang condong ke atas (1.0 sampai 1.3)
            # agar polusi bisa merayap naik ke kategori TIDAK SEHAT
            bias = np.random.uniform(0.98, 1.25) 
            feat_row[f'pm_sepuluh_lag_{sfx}'] = res['pm10'] * bias
            feat_row[f'pm_duakomalima_lag_{sfx}'] = res['pm25'] * bias
        else:
            feat_row[f'pm_sepuluh_lag_{sfx}'] = 50.0
            feat_row[f'pm_duakomalima_lag_{sfx}'] = 75.0
    
    # Calculate delta features after all lags are populated
    feat_row['pm25_delta'] = feat_row.get('pm_duakomalima_lag_1d', 60.0) - feat_row.get('pm_duakomalima_lag_7d', 75.0)
    feat_row['pm10_delta'] = feat_row.get('pm_sepuluh_lag_1d', 45.0) - feat_row.get('pm_sepuluh_lag_7d', 50.0)

    # 3. Predict & Convert
    input_df = pd.DataFrame([feat_row])[FEATURES]
    p10_val = model_pm10.predict(input_df)[0]
    p25_val = model_pm25.predict(input_df)[0]
    
    # Store numeric results for next lags
    pm_values_dict[(st, dt)] = {'pm10': p10_val, 'pm25': p25_val}
    
    # Convert to Category
    cat = get_ispu_category(p10_val, p25_val)
    final_results.append({'id': row['id'], 'category': cat})

# Save results
submission_df = pd.DataFrame(final_results)
submission_df.to_csv("submission_catboost_regression_stage2.csv", index=False)
print("âœ“ Submission saved: submission_catboost_regression_stage2.csv")
print(submission_df['category'].value_counts())

ðŸ”„ Starting Recursive Regression Forecasting...


KeyError: 'pm_duakomalima_lag_7d'

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Prediksi angka PM pada data test
pred_pm10 = model_pm10.predict(X_test)
pred_pm25 = model_pm25.predict(X_test)

def evaluate_regression(y_true, y_pred, name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"ðŸ“Š Regression Metrics for {name}:")
    print(f"   - MAE  : {mae:.4f}")
    print(f"   - RMSE : {rmse:.4f}")
    print(f"   - R2   : {r2:.4f}\n")

evaluate_regression(y_test_pm10, pred_pm10, "PM10")
evaluate_regression(y_test_pm25, pred_pm25, "PM2.5")

ðŸ“Š Regression Metrics for PM10:
   - MAE  : 9.6518
   - RMSE : 12.1876
   - R2   : 0.5142

ðŸ“Š Regression Metrics for PM2.5:
   - MAE  : 12.6054
   - RMSE : 16.2263
   - R2   : 0.5666



In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns

# 1. Konversi hasil prediksi angka ke kategori
y_pred_cat = [get_ispu_category(p10, p25) for p10, p25 in zip(pred_pm10, pred_pm25)]

# 2. Ambil kategori asli dari data test (asumsi kolom 'kategori' ada di df)
y_true_cat = df[test_mask]['kategori'].values

# 3. Print Classification Report
print("ðŸŽ¯ Classification Performance (Converted from Regression):")
print(classification_report(y_true_cat, y_pred_cat))

# 4. Print Specific F1-Score
f1_weighted = f1_score(y_true_cat, y_pred_cat, average='weighted')
print(f"âœ… Final Weighted F1-Score: {f1_weighted:.4f}")

ðŸŽ¯ Classification Performance (Converted from Regression):
                precision    recall  f1-score   support

          BAIK       0.71      0.32      0.45       213
        SEDANG       0.74      0.94      0.83       849
TIDAK ADA DATA       0.00      0.00      0.00        12
   TIDAK SEHAT       0.42      0.12      0.19       141

      accuracy                           0.73      1215
     macro avg       0.47      0.35      0.37      1215
  weighted avg       0.69      0.73      0.68      1215

âœ… Final Weighted F1-Score: 0.6786
