In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import os

# 1. DEFINISI FILE (Sesuai cara panggil yang Anda minta)
files_dict = {
    2010: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv",
    2011: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv",
    2012: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv",
    2013: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv",
    2014: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv",
    2015: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv",
    2016: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv",
    2017: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv",
    2018: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv",
    2019: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv",
    2020: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv",
    2021: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv",
    2022: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv",
    2023: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv",
    2024: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv",
    2025: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv"
}

# 2. FUNGSI PEMBANTU (CLEANING)
def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(x):
    try:
        # Menangani angka serial Excel (misal di data 2022)
        if isinstance(x, (int, float)) or (isinstance(x, str) and x.replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(x), unit='D')
        return pd.to_datetime(x)
    except:
        return pd.NaT

all_data = []

# 3. LOOPING PROSES DATA
print("Memulai pemrosesan data...")
for year, filename in files_dict.items():
    if not os.path.exists(filename): continue
    
    df = pd.read_csv(filename)
    df.columns = [c.lower() for c in df.columns]
    
    # Identifikasi kolom lokasi dan kategori yang berbeda tiap tahun
    st_col = 'stasiun' if 'stasiun' in df.columns else 'lokasi_spku'
    cat_col = 'kategori' if 'kategori' in df.columns else 'categori'
    
    # Penanganan khusus tahun 2024-2025 (tanggal dipisah)
    if year >= 2024:
        df['dt_final'] = pd.to_datetime(df['periode_data'].astype(str) + 
                                        df['tanggal'].astype(str).str.zfill(2), 
                                        format='%Y%m%d', errors='coerce')
    else:
        df['dt_final'] = df['tanggal'].apply(parse_date)
    
    # Standardisasi hasil
    df['st_unified'] = df[st_col].apply(clean_stasiun)
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip()
    
    all_data.append(df[['dt_final', 'st_unified', 'cat_unified']])

# 4. PENGGABUNGAN & FEATURE ENGINEERING
master = pd.concat(all_data).dropna(subset=['dt_final'])
master = master[(master['cat_unified'] != 'TIDAK ADA DATA') & (master['st_unified'] != 'UNKNOWN')]

master['month'] = master['dt_final'].dt.month
master['day'] = master['dt_final'].dt.day
master['day_of_week'] = master['dt_final'].dt.dayofweek

# Label Encoding untuk Stasiun dan Kategori
le_st = LabelEncoder()
master['st_enc'] = le_st.fit_transform(master['st_unified'])
le_cat = LabelEncoder()
master['cat_enc'] = le_cat.fit_transform(master['cat_unified'])

# 5. PELATIHAN MODEL
print("Melatih model machine learning...")
X = master[['st_enc', 'month', 'day', 'day_of_week']]
y = master['cat_enc']

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# 6. PREDIKSI UNTUK SUBMISSION
print("Membuat prediksi...")
sub = pd.read_csv("sample_submission.csv") # Pastikan file ini ada di direktori utama
sub['tanggal_part'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['st_part'] = sub['id'].str.split('_').str[1].apply(clean_stasiun)

sub['month'] = sub['tanggal_part'].dt.month
sub['day'] = sub['tanggal_part'].dt.day
sub['day_of_week'] = sub['tanggal_part'].dt.dayofweek
sub['st_enc'] = le_st.transform(sub['st_part'])

X_test = sub[['st_enc', 'month', 'day', 'day_of_week']]
preds = model.predict(X_test)
sub['category'] = le_cat.inverse_transform(preds)

# 7. SIMPAN HASIL AKHIR
sub[['id', 'category']].to_csv("submission_final_RandomForest.csv", index=False)
print("Selesai! File 'submission_final.csv' telah berhasil dibuat.")

Memulai pemrosesan data...


  return pd.to_datetime(x)
  return pd.to_datetime(x)


Melatih model machine learning...
Membuat prediksi...
Selesai! File 'submission_final.csv' telah berhasil dibuat.


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# 1. Bagi data master menjadi Training dan Validation set (80:20)
# Menggunakan X dan y yang sudah didefinisikan di cell sebelumnya
X_train_eval, X_val, y_train_eval, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Latih model khusus untuk evaluasi
clf_eval = RandomForestClassifier(n_estimators=100, random_state=42)
clf_eval.fit(X_train_eval, y_train_eval)

# 3. Lakukan prediksi pada data validasi
y_pred_val = clf_eval.predict(X_val)

# 4. Hitung Macro F1-Score
macro_f1 = f1_score(y_val, y_pred_val, average='macro')

# 5. Tampilkan Hasil
print(f"--- Evaluasi Model Random Forest ---")
print(f"Macro F1-Score: {macro_f1:.4f}")
print("\nClassification Report:")

# Menggunakan le_cat (sesuai dengan kode lengkap di cell sebelumnya)
present_classes = np.unique(np.concatenate([y_val, y_pred_val]))
present_names = le_cat.inverse_transform(present_classes)

print(classification_report(y_val, y_pred_val, labels=present_classes, target_names=present_names))

--- Evaluasi Model Random Forest ---
Macro F1-Score: 0.2808

Classification Report:
                    precision    recall  f1-score   support

              BAIK       0.27      0.26      0.27       441
SANGAT TIDAK SEHAT       0.00      0.00      0.00        39
            SEDANG       0.68      0.76      0.72      2079
       TIDAK SEHAT       0.18      0.11      0.14       524

          accuracy                           0.57      3083
         macro avg       0.28      0.28      0.28      3083
      weighted avg       0.53      0.57      0.55      3083



In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# --- BAGIAN 1: EVALUASI (UNTUK MELIHAT SKOR) ---

# 1. Menyiapkan Fitur Cyclical (Sin/Cos) pada Data Master
master['month_sin'] = np.sin(2 * np.pi * master['month']/12)
master['month_cos'] = np.cos(2 * np.pi * master['month']/12)

# 2. Update Fitur dan Target
features_eval = ['st_enc', 'month_sin', 'month_cos', 'day', 'day_of_week']
X_final_eval = master[features_eval]
y_final_eval = master['cat_enc']

# 3. Bagi data menjadi Train dan Validation set (80:20)
X_train_knn, X_val_knn, y_train_knn, y_val_knn = train_test_split(X_final_eval, y_final_eval, test_size=0.2, random_state=42)

# 4. Scaling
scaler_eval = StandardScaler()
X_train_scaled = scaler_eval.fit_transform(X_train_knn)
X_val_scaled = scaler_eval.transform(X_val_knn)

# 5. Latih Model untuk Evaluasi
knn_eval = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='manhattan')
knn_eval.fit(X_train_scaled, y_train_knn)

# 6. Prediksi & Tampilkan Skor
y_pred_knn = knn_eval.predict(X_val_scaled)
macro_f1_knn = f1_score(y_val_knn, y_pred_knn, average='macro')

print(f"--- Skor Model KNN Optimized ---")
print(f"Macro F1-Score: {macro_f1_knn:.4f}")
print("\nClassification Report:")
present_classes = np.unique(np.concatenate([y_val_knn, y_pred_knn]))
present_names = le_cat.inverse_transform(present_classes)
print(classification_report(y_val_knn, y_pred_knn, labels=present_classes, target_names=present_names))


# --- BAGIAN 2: MEMBUAT FILE SUBMISSION (MENGGUNAKAN SELURUH DATA) ---

print("\n--- Membuat File Submission ---")

# 1. Latih ulang model menggunakan SELURUH data agar lebih akurat
scaler_final = StandardScaler()
X_master_scaled = scaler_final.fit_transform(X_final_eval)
knn_final = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='manhattan')
knn_final.fit(X_master_scaled, y_final_eval)

# 2. Baca Sample Submission
sub = pd.read_csv("sample_submission.csv")

# 3. Preprocessing Data Test (Ekstraksi Fitur dari ID)
sub['tanggal_dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['stasiun_nm'] = sub['id'].str.split('_').str[1]

sub['month'] = sub['tanggal_dt'].dt.month
sub['day'] = sub['tanggal_dt'].dt.day
sub['day_of_week'] = sub['tanggal_dt'].dt.dayofweek
sub['month_sin'] = np.sin(2 * np.pi * sub['month']/12)
sub['month_cos'] = np.cos(2 * np.pi * sub['month']/12)

# Gunakan fungsi clean_stasiun yang sudah kita buat sebelumnya untuk encoding
sub['st_enc'] = le_st.transform(sub['stasiun_nm'].apply(clean_stasiun))

# 4. Scaling Data Test
X_test_final = sub[features_eval]
X_test_scaled = scaler_final.transform(X_test_final)

# 5. Prediksi Final
preds_final = knn_final.predict(X_test_scaled)
sub['category'] = le_cat.inverse_transform(preds_final)

# 6. Simpan ke CSV
sub[['id', 'category']].to_csv("submission_final_knn.csv", index=False)
print("Selesai! File 'submission_final_knn.csv' telah berhasil dibuat.")

--- Skor Model KNN Optimized ---
Macro F1-Score: 0.2838

Classification Report:
                    precision    recall  f1-score   support

              BAIK       0.27      0.26      0.27       441
SANGAT TIDAK SEHAT       0.00      0.00      0.00        39
            SEDANG       0.68      0.77      0.72      2079
       TIDAK SEHAT       0.21      0.11      0.15       524

          accuracy                           0.58      3083
         macro avg       0.29      0.29      0.28      3083
      weighted avg       0.53      0.58      0.55      3083


--- Membuat File Submission ---
Selesai! File 'submission_final_knn.csv' telah berhasil dibuat.


In [33]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os

# 1. MEMANGGIL DATA (Sesuai instruksi Anda, dengan file 2025 terbaru)
print("Memuat data...")
ISPU2010 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv")
ISPU2011 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv")
ISPU2012 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv")
ISPU2013 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv")
ISPU2014 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv")
ISPU2015 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv")
ISPU2016 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv")
ISPU2017 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv")
ISPU2018 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv")
ISPU2019 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv")
ISPU2020 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv")
ISPU2021 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv")
ISPU2022 = pd.read_csv("ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv")
ISPU2023 = pd.read_csv("ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv")
ISPU2024 = pd.read_csv("ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv")
# Menggunakan file terbaru yang Anda berikan
ISPU2025 = pd.read_csv("data_ispu_2025.csv") 

# 2. FUNGSI PREPROCESSING UNTUK MENYAMAKAN SKEMA
def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(row, year):
    try:
        # Penanganan khusus 2024-2025
        if year >= 2024:
            return pd.to_datetime(str(row['periode_data']) + str(int(row['tanggal'])).zfill(2), format='%Y%m%d')
        # Penanganan angka serial excel (2022)
        val = row['tanggal']
        if isinstance(val, (int, float)) or (isinstance(val, str) and val.replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(val), unit='D')
        return pd.to_datetime(val)
    except: return pd.NaT

# Daftar dataframe dan tahunnya
df_list = [
    (ISPU2010, 2010), (ISPU2011, 2011), (ISPU2012, 2012), (ISPU2013, 2013),
    (ISPU2014, 2014), (ISPU2015, 2015), (ISPU2016, 2016), (ISPU2017, 2017),
    (ISPU2018, 2018), (ISPU2019, 2019), (ISPU2020, 2020), (ISPU2021, 2021),
    (ISPU2022, 2022), (ISPU2023, 2023), (ISPU2024, 2024), (ISPU2025, 2025)
]

processed_dfs = []
for df, year in df_list:
    df.columns = [c.lower() for c in df.columns]
    st_col = 'stasiun' if 'stasiun' in df.columns else 'lokasi_spku'
    cat_col = 'kategori' if 'kategori' in df.columns else 'categori'
    
    df['dt_final'] = df.apply(lambda r: parse_date(r, year), axis=1)
    df['st_unified'] = df[st_col].apply(clean_stasiun)
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip()
    
    processed_dfs.append(df[['dt_final', 'st_unified', 'cat_unified']])

# 3. GABUNG DAN FEATURE ENGINEERING
master = pd.concat(processed_dfs).dropna(subset=['dt_final'])
master = master[(master['cat_unified'] != 'TIDAK ADA DATA') & (master['st_unified'] != 'UNKNOWN')]

# Ekstraksi Fitur Waktu & Cyclical Encoding
master['month'] = master['dt_final'].dt.month
master['day'] = master['dt_final'].dt.day
master['day_of_week'] = master['dt_final'].dt.dayofweek
master['month_sin'] = np.sin(2 * np.pi * master['month']/12)
master['month_cos'] = np.cos(2 * np.pi * master['month']/12)

# Encoding Label
le_st = LabelEncoder()
master['st_enc'] = le_st.fit_transform(master['st_unified'])
le_cat = LabelEncoder()
master['cat_enc'] = le_cat.fit_transform(master['cat_unified'])

# 4. EVALUASI MODEL (MACRO F1-SCORE)
features = ['st_enc', 'month_sin', 'month_cos', 'day', 'day_of_week']
X = master[features]
y = master['cat_enc']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Menggunakan KNN sesuai referensi jurnal
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='manhattan')
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_val_scaled)
print(f"\n--- EVALUASI PERFORMA MODEL ---")
print(f"Macro F1-Score: {f1_score(y_val, y_pred, average='macro'):.4f}")

# Menampilkan laporan klasifikasi yang aman dari error label
unique_labels = np.unique(np.concatenate([y_val, y_pred]))
print(classification_report(y_val, y_pred, labels=unique_labels, target_names=le_cat.inverse_transform(unique_labels)))

# 5. MEMBUAT FILE SUBMISSION FINAL
print("\nMembuat file submission...")
# Latih ulang pada 100% data untuk hasil terbaik
scaler_final = StandardScaler()
X_master_scaled = scaler_final.fit_transform(X)
knn.fit(X_master_scaled, y)

sub = pd.read_csv("sample_submission.csv")
sub['tanggal_dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['stasiun_nm'] = sub['id'].str.split('_').str[1]

sub['month'] = sub['tanggal_dt'].dt.month
sub['day'] = sub['tanggal_dt'].dt.day
sub['day_of_week'] = sub['tanggal_dt'].dt.dayofweek
sub['month_sin'] = np.sin(2 * np.pi * sub['month']/12)
sub['month_cos'] = np.cos(2 * np.pi * sub['month']/12)
sub['st_enc'] = le_st.transform(sub['stasiun_nm'].apply(clean_stasiun))

X_test_scaled = scaler_final.transform(sub[features])
sub['category'] = le_cat.inverse_transform(knn.predict(X_test_scaled))

sub[['id', 'category']].to_csv("submission_final_knn2.csv", index=False)
print("Selesai! File 'submission_final_2025.csv' telah siap.")

Memuat data...


  return pd.to_datetime(val)
  return pd.to_datetime(val)



--- EVALUASI PERFORMA MODEL ---
Macro F1-Score: 0.2943
                    precision    recall  f1-score   support

              BAIK       0.32      0.28      0.30       492
SANGAT TIDAK SEHAT       0.00      0.00      0.00        42
            SEDANG       0.70      0.77      0.73      2176
       TIDAK SEHAT       0.18      0.13      0.15       463

          accuracy                           0.59      3173
         macro avg       0.30      0.29      0.29      3173
      weighted avg       0.55      0.59      0.57      3173


Membuat file submission...
Selesai! File 'submission_final_2025.csv' telah siap.


In [34]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os

# 1. DAFTAR FILE DAN PEMUATAN DATA
files_dict = {
    2010: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv",
    2011: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv",
    2012: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv",
    2013: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv",
    2014: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv",
    2015: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv",
    2016: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv",
    2017: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv",
    2018: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv",
    2019: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv",
    2020: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv",
    2021: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv",
    2022: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv",
    2023: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv",
    2024: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv",
    2025: "data_ispu_2025.csv"
}

pollutant_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida']
rename_map = {'pm10': 'pm_sepuluh', 'pm_10': 'pm_sepuluh', 'pm25': 'pm_duakomalima', 'pm_25': 'pm_duakomalima',
              'so2': 'sulfur_dioksida', 'co': 'karbon_monoksida', 'o3': 'ozon', 'no2': 'nitrogen_dioksida', 'lokasi_spku': 'stasiun'}

def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(row, year):
    try:
        if year >= 2024: return pd.to_datetime(str(row['periode_data']) + str(int(row['tanggal'])).zfill(2), format='%Y%m%d')
        val = row['tanggal']
        if isinstance(val, (int, float)) or (isinstance(val, str) and val.replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(val), unit='D')
        return pd.to_datetime(val)
    except: return pd.NaT

# 2. GABUNG MASTER DATA
all_data = []
for year, filename in files_dict.items():
    if not os.path.exists(filename): continue
    df = pd.read_csv(filename).rename(columns=str.lower).rename(columns=rename_map)
    df['dt_final'] = df.apply(lambda r: parse_date(r, year), axis=1)
    df['st_unified'] = df['stasiun'].apply(clean_stasiun)
    cat_col = 'kategori' if 'kategori' in df.columns else 'categori'
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip()
    
    for c in pollutant_cols:
        if c not in df.columns: df[c] = np.nan
        else: df[c] = pd.to_numeric(df[c].astype(str).str.replace(',', '.').replace(['---', '-', '', 'N/A', 'NAN'], np.nan), errors='coerce')
    all_data.append(df[['dt_final', 'st_unified', 'cat_unified'] + pollutant_cols])

master = pd.concat(all_data, ignore_index=True).dropna(subset=['dt_final'])
master = master[~master['cat_unified'].isin(['TIDAK ADA DATA', 'NAN', '', 'NONE'])]
master = master[master['st_unified'] != 'UNKNOWN']

# --- SOLUSI VALUE ERROR (Urutan Label) & STRATIFY ERROR ---
# Hapus kategori yang jumlahnya terlalu sedikit (< 2) agar bisa di-split (stratify)
master = master[master.groupby('cat_unified')['cat_unified'].transform('count') > 1]

# Re-Encoding agar label urut 0, 1, 2, 3... (Wajib untuk XGBoost)
le_st = LabelEncoder()
master['st_enc'] = le_st.fit_transform(master['st_unified'])
le_cat = LabelEncoder()
master['cat_enc'] = le_cat.fit_transform(master['cat_unified'])

# 3. TRAINING XGBOOST
features = ['st_enc'] + pollutant_cols
X = master[features]
y = master['cat_enc']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# EVALUASI
y_pred = xgb_model.predict(X_val)
print(f"Macro F1-Score: {f1_score(y_val, y_pred, average='macro'):.4f}")
print(classification_report(y_val, y_pred, target_names=le_cat.classes_))

# 4. SUBMISSION
sub = pd.read_csv("sample_submission.csv")
sub['dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['st_unified'] = sub['id'].str.split('_').str[1].apply(clean_stasiun)

# Merge dengan data 2025 untuk ambil polutan
df_2025_processed = all_data[-1]
sub_joined = sub.merge(df_2025_processed, left_on=['dt', 'st_unified'], right_on=['dt_final', 'st_unified'], how='left')
sub_joined['st_enc'] = le_st.transform(sub_joined['st_unified'])

sub['category'] = le_cat.inverse_transform(xgb_model.predict(sub_joined[features]))
sub[['id', 'category']].to_csv("submission_xgb_final.csv", index=False)
print("Selesai! File 'submission_xgb_final.csv' telah dibuat.")

  return pd.to_datetime(val)
  return pd.to_datetime(val)


Macro F1-Score: 0.9704
                    precision    recall  f1-score   support

              BAIK       1.00      1.00      1.00       481
SANGAT TIDAK SEHAT       0.92      0.90      0.91        40
            SEDANG       0.99      1.00      1.00      2163
       TIDAK SEHAT       0.98      0.97      0.98       489

          accuracy                           0.99      3173
         macro avg       0.97      0.97      0.97      3173
      weighted avg       0.99      0.99      0.99      3173

Selesai! File 'submission_xgb_final.csv' telah dibuat.


In [39]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os

# 1. KONFIGURASI FILE
files_dict = {
    2010: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv",
    2011: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv",
    2012: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv",
    2013: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv",
    2014: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv",
    2015: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv",
    2016: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv",
    2017: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv",
    2018: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv",
    2019: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv",
    2020: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv",
    2021: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv",
    2022: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv",
    2023: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv",
    2024: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv",
    2025: "data_ispu_2025.csv"
}

pollutant_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida']
rename_map = {'pm10': 'pm_sepuluh', 'pm_10': 'pm_sepuluh', 'pm25': 'pm_duakomalima', 'pm_25': 'pm_duakomalima',
              'so2': 'sulfur_dioksida', 'co': 'karbon_monoksida', 'o3': 'ozon', 'no2': 'nitrogen_dioksida', 'lokasi_spku': 'stasiun'}

def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(row, year):
    try:
        if year >= 2024:
            return pd.to_datetime(str(int(row['periode_data'])) + str(int(row['tanggal'])).zfill(2), format='%Y%m%d')
        val = row['tanggal']
        if isinstance(val, (int, float)) or (isinstance(val, str) and str(val).replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(val), unit='D')
        return pd.to_datetime(val)
    except: return pd.NaT

# 2. PEMUATAN DATA (PASTIKAN dt_final TERBENTUK)
all_data_list = []
print("Memproses file...")
for year, filename in files_dict.items():
    if not os.path.exists(filename): continue
    df = pd.read_csv(filename).rename(columns=str.lower).rename(columns=rename_map)
    df['dt_final'] = df.apply(lambda r: parse_date(r, year), axis=1)
    df['st_unified'] = df['stasiun'].apply(clean_stasiun)
    cat_col = 'kategori' if 'kategori' in df.columns else 'categori'
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip()
    
    for c in pollutant_cols:
        if c not in df.columns: df[c] = np.nan
        else: df[c] = pd.to_numeric(df[c].astype(str).str.replace(',', '.').replace(['---', '-', '', 'N/A', 'NAN'], np.nan), errors='coerce')
    all_data_list.append(df[['dt_final', 'st_unified', 'cat_unified'] + pollutant_cols])

master = pd.concat(all_data_list, ignore_index=True).dropna(subset=['dt_final'])
master = master[~master['cat_unified'].isin(['TIDAK ADA DATA', 'NAN', '', 'NONE'])]
master = master[master['st_unified'] != 'UNKNOWN']

# 3. FITUR LAG (DATA KEMARIN)
print("Membuat fitur lag...")
master = master.sort_values(by=['st_unified', 'dt_final'])
for col in pollutant_cols:
    master[f'{col}_lag1'] = master.groupby('st_unified')[col].shift(1)

master_lag = master.dropna(subset=[f'{c}_lag1' for c in pollutant_cols]).copy()

# 4. FILTER KELAS & ENCODING (SOLUSI VALUEERROR)
# Pastikan setiap kategori minimal punya 2 data agar bisa dibagi ke Train/Val
cat_counts = master_lag['cat_unified'].value_counts()
valid_cats = cat_counts[cat_counts >= 2].index
master_lag = master_lag[master_lag['cat_unified'].isin(valid_cats)]

le_st = LabelEncoder()
master_lag['st_enc'] = le_st.fit_transform(master_lag['st_unified'])
le_cat = LabelEncoder()
master_lag['cat_enc'] = le_cat.fit_transform(master_lag['cat_unified'])

# 5. TRAINING
lag_features = ['st_enc'] + [f'{c}_lag1' for c in pollutant_cols]
X = master_lag[lag_features]
y = master_lag['cat_enc']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = HistGradientBoostingClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

# 6. EVALUASI (SOLUSI MISMATCH TARGET_NAMES)
y_pred = clf.predict(X_val)
print(f"\nMacro F1-Score: {f1_score(y_val, y_pred, average='macro'):.4f}")

# Teknik Agar Classification Report Tidak Error:
# Gunakan parameter 'labels' untuk mendefinisikan semua kelas yang seharusnya ada
print("\nClassification Report:")
print(classification_report(
    y_val, 
    y_pred, 
    labels=np.arange(len(le_cat.classes_)), 
    target_names=le_cat.classes_
))

# 7. SUBMISSION
sub = pd.read_csv("sample_submission.csv")
sub['dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['st_unified'] = sub['id'].str.split('_').str[1].apply(clean_stasiun)
sub['dt_yesterday'] = sub['dt'] - pd.Timedelta(days=1)

sub_joined = sub.merge(master[['dt_final', 'st_unified'] + pollutant_cols], 
                       left_on=['dt_yesterday', 'st_unified'], 
                       right_on=['dt_final', 'st_unified'], how='left')

for c in pollutant_cols:
    sub_joined[f'{c}_lag1'] = sub_joined[c]

sub_joined['st_enc'] = le_st.transform(sub_joined['st_unified'])
sub['category'] = le_cat.inverse_transform(clf.predict(sub_joined[lag_features]))

sub[['id', 'category']].to_csv("submission_xgb_final2.csv", index=False)
print("\nSukses! File 'submission_final_fixed.csv' telah dibuat.")

Memproses file...


  return pd.to_datetime(val)
  return pd.to_datetime(val)


Membuat fitur lag...

Macro F1-Score: 0.5838

Classification Report:
                    precision    recall  f1-score   support

              BAIK       0.44      0.59      0.50       122
SANGAT TIDAK SEHAT       0.00      0.00      0.00         0
            SEDANG       0.87      0.74      0.80       978
       TIDAK SEHAT       0.36      0.59      0.45       159

          accuracy                           0.71      1259
         macro avg       0.42      0.48      0.44      1259
      weighted avg       0.76      0.71      0.73      1259


Sukses! File 'submission_final_fixed.csv' telah dibuat.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [41]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os

# ==========================================
# 1. KONFIGURASI FILE & FUNGSI HELPER
# ==========================================
files_dict = {
    2010: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv",
    2011: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv",
    2012: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv",
    2013: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv",
    2014: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv",
    2015: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv",
    2016: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv",
    2017: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv",
    2018: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv",
    2019: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv",
    2020: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv",
    2021: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv",
    2022: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv",
    2023: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv",
    2024: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv",
    2025: "data_ispu_2025.csv"
}

pollutant_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida']
rename_map = {'pm10': 'pm_sepuluh', 'pm_10': 'pm_sepuluh', 'pm25': 'pm_duakomalima', 'pm_25': 'pm_duakomalima',
              'so2': 'sulfur_dioksida', 'co': 'karbon_monoksida', 'o3': 'ozon', 'no2': 'nitrogen_dioksida', 'lokasi_spku': 'stasiun'}

def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(row, year):
    try:
        if year >= 2024:
            return pd.to_datetime(str(int(row['periode_data'])) + str(int(row['tanggal'])).zfill(2), format='%Y%m%d')
        val = row['tanggal']
        if isinstance(val, (int, float)) or (isinstance(val, str) and str(val).replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(val), unit='D')
        return pd.to_datetime(val)
    except: return pd.NaT

# ==========================================
# 2. PEMUATAN DATA & PEMBERSIHAN
# ==========================================
all_data_list = []
print("1/5 Memproses file ISPU 2010-2025...")

for year, filename in files_dict.items():
    if not os.path.exists(filename): continue
    df = pd.read_csv(filename).rename(columns=str.lower).rename(columns=rename_map)
    df['dt_final'] = df.apply(lambda r: parse_date(r, year), axis=1)
    df['st_unified'] = df['stasiun'].apply(clean_stasiun)
    
    # Cari kolom kategori yang benar
    cat_col = next((c for c in ['kategori', 'categori'] if c in df.columns), None)
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip() if cat_col else 'UNKNOWN'
    
    for c in pollutant_cols:
        if c not in df.columns: df[c] = np.nan
        else: df[c] = pd.to_numeric(df[c].astype(str).str.replace(',', '.').replace(['---', '-', '', 'N/A', 'NAN'], np.nan), errors='coerce')
    
    all_data_list.append(df[['dt_final', 'st_unified', 'cat_unified'] + pollutant_cols])

master = pd.concat(all_data_list, ignore_index=True).dropna(subset=['dt_final'])
master = master[~master['cat_unified'].isin(['TIDAK ADA DATA', 'NAN', '', 'NONE', 'UNKNOWN'])]
master = master[master['st_unified'] != 'UNKNOWN']

# ==========================================
# 3. ADVANCED FEATURE ENGINEERING (LAG & ROLLING)
# ==========================================
print("2/5 Membuat fitur Lag & Rolling...")
master = master.sort_values(by=['st_unified', 'dt_final'])

for col in pollutant_cols:
    # Lag 1, 2, 3 (Data 1-3 hari sebelumnya)
    for i in [1, 2, 3]:
        master[f'{col}_lag{i}'] = master.groupby('st_unified')[col].shift(i)
    # Rolling Mean (Tren 3 dan 7 hari terakhir)
    master[f'{col}_roll3'] = master.groupby('st_unified')[col].transform(lambda x: x.shift(1).rolling(3).mean())
    master[f'{col}_roll7'] = master.groupby('st_unified')[col].transform(lambda x: x.shift(1).rolling(7).mean())

# Fitur Waktu
master['month'] = master['dt_final'].dt.month
master['day_of_week'] = master['dt_final'].dt.dayofweek
master['is_weekend'] = master['day_of_week'].isin([5, 6]).astype(int)

# Hapus baris yang fitur lag-nya kosong
master_feat = master.dropna(subset=[f'{c}_lag1' for c in pollutant_cols]).copy()

# Encoding Stasiun (st_enc) - Dilakukan DI SINI agar tidak ada KeyError
le_st = LabelEncoder()
master_feat['st_enc'] = le_st.fit_transform(master_feat['st_unified'])

# Filter kelas langka agar stratify tidak error
cat_counts = master_feat['cat_unified'].value_counts()
valid_cats = cat_counts[cat_counts >= 2].index
master_feat = master_feat[master_feat['cat_unified'].isin(valid_cats)]

# Label Encoding Kategori (Urut 0, 1, 2, 3...)
le_cat = LabelEncoder()
master_feat['cat_enc'] = le_cat.fit_transform(master_feat['cat_unified'])

# ==========================================
# 4. PELATIHAN MODEL
# ==========================================
lag_cols = [f'{c}_lag{i}' for c in pollutant_cols for i in [1, 2, 3]]
roll_cols = [f'{c}_roll{w}' for c in pollutant_cols for w in [3, 7]]
features_final = ['st_enc', 'month', 'day_of_week', 'is_weekend'] + lag_cols + roll_cols

X = master_feat[features_final]
y = master_feat['cat_enc']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("3/5 Melatih model HistGradientBoosting...")
clf = HistGradientBoostingClassifier(random_state=42, class_weight='balanced', max_iter=300)
clf.fit(X_train, y_train)

# Evaluasi
y_pred = clf.predict(X_val)
print(f"\nMacro F1-Score: {f1_score(y_val, y_pred, average='macro'):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred, labels=np.arange(len(le_cat.classes_)), target_names=le_cat.classes_))

# ==========================================
# 5. GENERATE SUBMISSION (SINKRONISASI FITUR)
# ==========================================
print("\n4/5 Sinkronisasi fitur untuk submission...")
sub = pd.read_csv("sample_submission.csv")
sub['dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['st_unified'] = sub['id'].str.split('_').str[1].apply(clean_stasiun)

# Fitur Kalender & Encoding Stasiun
sub['month'] = sub['dt'].dt.month
sub['day_of_week'] = sub['dt'].dt.dayofweek
sub['is_weekend'] = sub['day_of_week'].isin([5, 6]).astype(int)
sub['st_enc'] = le_st.transform(sub['st_unified'])

# Loop context window untuk mengambil lag dari data master
sub_list = []
for _, row in sub.iterrows():
    # Ambil 7 hari terakhir dari master SEBELUM tanggal target
    ctx = master[(master['st_unified'] == row['st_unified']) & (master['dt_final'] < row['dt'])].tail(7)
    f_dict = {'id': row['id']}
    for col in pollutant_cols:
        f_dict[f'{col}_lag1'] = ctx[col].iloc[-1] if len(ctx) >= 1 else np.nan
        f_dict[f'{col}_lag2'] = ctx[col].iloc[-2] if len(ctx) >= 2 else np.nan
        f_dict[f'{col}_lag3'] = ctx[col].iloc[-3] if len(ctx) >= 3 else np.nan
        f_dict[f'{col}_roll3'] = ctx[col].tail(3).mean() if len(ctx) >= 3 else np.nan
        f_dict[f'{col}_roll7'] = ctx[col].tail(7).mean() if len(ctx) >= 7 else np.nan
    sub_list.append(f_dict)

sub_final_feat = sub.merge(pd.DataFrame(sub_list), on='id')
sub['category'] = le_cat.inverse_transform(clf.predict(sub_final_feat[features_final]))

# Save
sub[['id', 'category']].to_csv("submission_xgb_final3.csv", index=False)
print("5/5 Selesai! File 'submission_advanced_vfinal.csv' telah siap.")

1/5 Memproses file ISPU 2010-2025...


  return pd.to_datetime(val)
  return pd.to_datetime(val)


2/5 Membuat fitur Lag & Rolling...
3/5 Melatih model HistGradientBoosting...

Macro F1-Score: 0.4500

Classification Report:
                    precision    recall  f1-score   support

              BAIK       0.54      0.48      0.51       122
SANGAT TIDAK SEHAT       0.00      0.00      0.00         0
            SEDANG       0.84      0.88      0.86       978
       TIDAK SEHAT       0.50      0.38      0.43       159

          accuracy                           0.78      1259
         macro avg       0.47      0.44      0.45      1259
      weighted avg       0.77      0.78      0.77      1259


4/5 Sinkronisasi fitur untuk submission...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


5/5 Selesai! File 'submission_advanced_vfinal.csv' telah siap.


In [43]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os

# ==========================================
# 1. SETUP & DATA LOADING (Re-build Master)
# ==========================================
files_dict = {
    2010: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv",
    2011: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-komponen-data.csv",
    2012: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv",
    2013: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2013-komponen-data.csv",
    2014: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv",
    2015: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv",
    2016: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv",
    2017: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv",
    2018: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv",
    2019: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2019-komponen-data.csv",
    2020: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2020-komponen-data.csv",
    2021: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2021-komponen-data.csv",
    2022: "ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2022-komponen-data.csv",
    2023: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv",
    2024: "ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv",
    2025: "data_ispu_2025.csv"
}

pollutant_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida']
rename_map = {'pm10': 'pm_sepuluh', 'pm_10': 'pm_sepuluh', 'pm25': 'pm_duakomalima', 'pm_25': 'pm_duakomalima',
              'so2': 'sulfur_dioksida', 'co': 'karbon_monoksida', 'o3': 'ozon', 'no2': 'nitrogen_dioksida', 'lokasi_spku': 'stasiun'}

def clean_stasiun(s):
    s = str(s).upper()
    for code in ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']:
        if code in s: return code
    return 'UNKNOWN'

def parse_date(row, year):
    try:
        if year >= 2024:
            return pd.to_datetime(str(int(row['periode_data'])) + str(int(row['tanggal'])).zfill(2), format='%Y%m%d')
        val = row['tanggal']
        if isinstance(val, (int, float)) or (isinstance(val, str) and str(val).replace('.','').isdigit()):
            return pd.to_datetime('1899-12-30') + pd.to_timedelta(float(val), unit='D')
        return pd.to_datetime(val)
    except: return pd.NaT

all_data = []
print("Memuat data historis sesuai struktur jurnal...")
for year, filename in files_dict.items():
    if not os.path.exists(filename): continue
    df = pd.read_csv(filename).rename(columns=str.lower).rename(columns=rename_map)
    df['dt_final'] = df.apply(lambda r: parse_date(r, year), axis=1)
    df['st_unified'] = df['stasiun'].apply(clean_stasiun)
    cat_col = next((c for c in ['kategori', 'categori'] if c in df.columns), None)
    df['cat_unified'] = df[cat_col].astype(str).str.upper().str.strip() if cat_col else 'UNKNOWN'
    
    for c in pollutant_cols:
        if c not in df.columns: df[c] = np.nan
        else: df[c] = pd.to_numeric(df[c].astype(str).str.replace(',', '.').replace(['---', '-', '', 'N/A', 'NAN'], np.nan), errors='coerce')
    all_data.append(df[['dt_final', 'st_unified', 'cat_unified'] + pollutant_cols])

master = pd.concat(all_data, ignore_index=True).dropna(subset=['dt_final'])
master = master[~master['cat_unified'].isin(['TIDAK ADA DATA', 'NAN', '', 'NONE', 'UNKNOWN'])]
master = master[master['st_unified'] != 'UNKNOWN']

# ==========================================
# 2. FEATURE ENGINEERING (LAGGED POLLUTANTS)
# ==========================================
print("Membuat fitur Lag polutan (H-1)...")
master = master.sort_values(by=['st_unified', 'dt_final'])
for col in pollutant_cols:
    master[f'{col}_lag1'] = master.groupby('st_unified')[col].shift(1)

master_lag = master.dropna(subset=[f'{c}_lag1' for c in pollutant_cols]).copy()

# Filter kelas minimal 2 sampel (untuk stratify)
cat_counts = master_lag['cat_unified'].value_counts()
valid_cats = cat_counts[cat_counts >= 2].index
master_lag = master_lag[master_lag['cat_unified'].isin(valid_cats)]

# Label Encoding
le_st = LabelEncoder()
master_lag['st_enc'] = le_st.fit_transform(master_lag['st_unified'])
le_cat = LabelEncoder()
master_lag['cat_enc'] = le_cat.fit_transform(master_lag['cat_unified'])

# ==========================================
# 3. TRAINING (HIST-GRADIENT BOOSTING)
# ==========================================
# Fitur: Stasiun + 6 Polutan Utama (Sesuai Jurnal)
features = ['st_enc'] + [f'{c}_lag1' for c in pollutant_cols]
X = master_lag[features]
y = master_lag['cat_enc']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Melatih model (XGBoost logic)...")
clf = HistGradientBoostingClassifier(
    random_state=42, 
    class_weight='balanced', 
    max_iter=300,
    learning_rate=0.05
)
clf.fit(X_train, y_train)

# Evaluasi
y_pred = clf.predict(X_val)
print(f"\nMacro F1-Score: {f1_score(y_val, y_pred, average='macro'):.4f}")
print(classification_report(y_val, y_pred, labels=np.arange(len(le_cat.classes_)), target_names=le_cat.classes_))

# ==========================================
# 4. SUBMISSION GENERATION
# ==========================================
print("\nMembuat file submission...")
sub = pd.read_csv("sample_submission.csv")
sub['dt'] = pd.to_datetime(sub['id'].str.split('_').str[0])
sub['st_unified'] = sub['id'].str.split('_').str[1].apply(clean_stasiun)
sub['dt_yesterday'] = sub['dt'] - pd.Timedelta(days=1)

# Lookup data polutan H-1 dari master data
sub_final = sub.merge(master[['dt_final', 'st_unified'] + pollutant_cols], 
                      left_on=['dt_yesterday', 'st_unified'], 
                      right_on=['dt_final', 'st_unified'], how='left')

# Mapping ke nama kolom fitur
for c in pollutant_cols:
    sub_final[f'{c}_lag1'] = sub_final[c]

sub_final['st_enc'] = le_st.transform(sub_final['st_unified'])
sub['category'] = le_cat.inverse_transform(clf.predict(sub_final[features]))

sub[['id', 'category']].to_csv("submission_xgb_final4.csv", index=False)
print("Selesai! File 'submission_journal_v1.csv' berhasil dibuat.")

Memuat data historis sesuai struktur jurnal...


  return pd.to_datetime(val)
  return pd.to_datetime(val)


Membuat fitur Lag polutan (H-1)...
Melatih model (XGBoost logic)...

Macro F1-Score: 0.5865
                    precision    recall  f1-score   support

              BAIK       0.44      0.55      0.49       122
SANGAT TIDAK SEHAT       0.00      0.00      0.00         0
            SEDANG       0.86      0.78      0.82       978
       TIDAK SEHAT       0.39      0.54      0.45       159

          accuracy                           0.73      1259
         macro avg       0.42      0.47      0.44      1259
      weighted avg       0.76      0.73      0.74      1259


Membuat file submission...
Selesai! File 'submission_journal_v1.csv' berhasil dibuat.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
