# Preprocessing Data ISPU Jakarta 2023-2025

Notebook ini melakukan preprocessing data untuk prediksi kategori kualitas udara di DKI Jakarta.

## Data yang digunakan:
1. **ISPU (Indeks Standar Pencemar Udara)** - Data tahun 2023-2025
2. **Cuaca Harian** - Data temperatur dari 5 stasiun DKI
3. **NDVI (Vegetation Index)** - Indeks vegetasi Jakarta

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load dan Gabungkan Data ISPU 2023-2025

In [None]:
# Path ke folder data
BASE_PATH = '..'

# Load ISPU data 2023-2025
ispu_2023 = pd.read_csv(f'{BASE_PATH}/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv')
ispu_2024 = pd.read_csv(f'{BASE_PATH}/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv')
ispu_2025 = pd.read_csv(f'{BASE_PATH}/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv')

print("Data ISPU 2023:")
print(f"Columns: {ispu_2023.columns.tolist()}")
print(f"Shape: {ispu_2023.shape}")

print("\nData ISPU 2024:")
print(f"Columns: {ispu_2024.columns.tolist()}")
print(f"Shape: {ispu_2024.shape}")

print("\nData ISPU 2025:")
print(f"Columns: {ispu_2025.columns.tolist()}")
print(f"Shape: {ispu_2025.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '../air-polution-prediction/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv'

In [None]:
# Standardisasi format tanggal dan kolom ISPU 2023 (format berbeda)
def standardize_ispu_2023(df):
    """Standardisasi format data ISPU 2023 yang berbeda dari 2024-2025"""
    df = df.copy()
    # Kolom tanggal sudah dalam format YYYY-MM-DD
    df['tanggal'] = pd.to_datetime(df['tanggal'])
    df['bulan'] = df['tanggal'].dt.month
    
    # Rename kolom untuk konsistensi
    df = df.rename(columns={
        'pm_sepuluh': 'pm_sepuluh',
        'pm_duakomalima': 'pm_duakomalima',
        'sulfur_dioksida': 'sulfur_dioksida',
        'karbon_monoksida': 'karbon_monoksida',
        'ozon': 'ozon',
        'nitrogen_dioksida': 'nitrogen_dioksida'
    })
    
    return df

def standardize_ispu_2024_2025(df, year):
    """Standardisasi format data ISPU 2024-2025"""
    df = df.copy()
    # Buat kolom tanggal dari bulan dan tanggal
    df['tanggal'] = pd.to_datetime(df.apply(
        lambda x: f"{year}-{int(x['bulan']):02d}-{int(x['tanggal']):02d}", axis=1
    ), errors='coerce')
    
    return df

# Proses standardisasi
ispu_2023_std = standardize_ispu_2023(ispu_2023)
ispu_2024_std = standardize_ispu_2024_2025(ispu_2024, 2024)
ispu_2025_std = standardize_ispu_2025_2025 = standardize_ispu_2024_2025(ispu_2025, 2025)

print("Standardized ISPU 2023 tanggal range:", ispu_2023_std['tanggal'].min(), "-", ispu_2023_std['tanggal'].max())
print("Standardized ISPU 2024 tanggal range:", ispu_2024_std['tanggal'].min(), "-", ispu_2024_std['tanggal'].max())
print("Standardized ISPU 2025 tanggal range:", ispu_2025_std['tanggal'].min(), "-", ispu_2025_std['tanggal'].max())

Standardized ISPU 2023 tanggal range: 2022-12-01 00:00:00 - 2023-11-30 00:00:00
Standardized ISPU 2024 tanggal range: 2024-01-01 00:00:00 - 2024-12-31 00:00:00
Standardized ISPU 2025 tanggal range: 2025-01-01 00:00:00 - 2025-08-31 00:00:00


In [None]:
# Ekstrak stasiun_id dari kolom stasiun
def extract_stasiun_id(stasiun_name):
    """Ekstrak ID stasiun (DKI1-DKI5) dari nama stasiun"""
    if pd.isna(stasiun_name):
        return None
    stasiun_str = str(stasiun_name).upper()
    for i in range(1, 6):
        if f'DKI{i}' in stasiun_str:
            return f'DKI{i}'
    return None

# Apply ke semua dataset
for df in [ispu_2023_std, ispu_2024_std, ispu_2025_std]:
    df['stasiun_id'] = df['stasiun'].apply(extract_stasiun_id)

print("Stasiun unik 2023:", ispu_2023_std['stasiun_id'].unique())
print("Stasiun unik 2024:", ispu_2024_std['stasiun_id'].unique())
print("Stasiun unik 2025:", ispu_2025_std['stasiun_id'].unique())

Stasiun unik 2023: <StringArray>
['DKI5', 'DKI1', 'DKI2', 'DKI3', 'DKI4']
Length: 5, dtype: str
Stasiun unik 2024: <StringArray>
['DKI3', 'DKI1', 'DKI2', 'DKI4', 'DKI5']
Length: 5, dtype: str
Stasiun unik 2025: <StringArray>
['DKI5', 'DKI1', 'DKI2', 'DKI3', 'DKI4']
Length: 5, dtype: str


In [None]:
# Gabungkan semua data ISPU
# Pilih kolom yang akan digunakan
cols_to_use = ['tanggal', 'stasiun_id', 'pm_sepuluh', 'pm_duakomalima', 
               'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida',
               'max', 'parameter_pencemar_kritis', 'kategori']

# Gabungkan dataset
ispu_combined = pd.concat([
    ispu_2023_std[cols_to_use],
    ispu_2024_std[cols_to_use],
    ispu_2025_std[cols_to_use]
], ignore_index=True)

# Hapus data dengan tanggal NaT atau stasiun_id None
ispu_combined = ispu_combined.dropna(subset=['tanggal', 'stasiun_id'])

# Sort by tanggal dan stasiun_id
ispu_combined = ispu_combined.sort_values(['tanggal', 'stasiun_id']).reset_index(drop=True)

print(f"Total data ISPU gabungan: {len(ispu_combined)}")
print(f"Range tanggal: {ispu_combined['tanggal'].min()} - {ispu_combined['tanggal'].max()}")
print(f"\nDistribusi kategori:")
print(ispu_combined['kategori'].value_counts())

Total data ISPU gabungan: 4870
Range tanggal: 2022-12-01 00:00:00 - 2025-08-31 00:00:00

Distribusi kategori:
kategori
SEDANG                3684
BAIK                   617
TIDAK SEHAT            526
TIDAK ADA DATA          39
SANGAT TIDAK SEHAT       4
Name: count, dtype: int64


In [None]:
# Clean data numerik - replace '-' dan '---' dengan NaN
numeric_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 
                'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max']

for col in numeric_cols:
    ispu_combined[col] = pd.to_numeric(ispu_combined[col].replace(['-', '---', ''], np.nan), errors='coerce')

# Cek missing values
print("Missing values per kolom:")
print(ispu_combined[numeric_cols].isnull().sum())
print(f"\nTotal rows dengan kategori TIDAK ADA DATA: {(ispu_combined['kategori'] == 'TIDAK ADA DATA').sum()}")

Missing values per kolom:
pm_sepuluh           408
pm_duakomalima       348
sulfur_dioksida       92
karbon_monoksida      88
ozon                  72
nitrogen_dioksida    132
max                   10
dtype: int64

Total rows dengan kategori TIDAK ADA DATA: 39


## 2. Load dan Proses Data Cuaca Harian (Temperatur)

In [None]:
# Load data cuaca harian dari 5 stasiun
cuaca_files = {
    'DKI1': 'cuaca-harian-dki1-bundaranhi.csv',
    'DKI2': 'cuaca-harian-dki2-kelapagading.csv',
    'DKI3': 'cuaca-harian-dki3-jagakarsa.csv',
    'DKI4': 'cuaca-harian-dki4-lubangbuaya.csv',
    'DKI5': 'cuaca-harian-dki5-kebonjeruk.csv'
}

cuaca_dfs = []

for stasiun_id, filename in cuaca_files.items():
    df = pd.read_csv(f'{BASE_PATH}/cuaca-harian/{filename}')
    df['stasiun_id'] = stasiun_id
    df['tanggal'] = pd.to_datetime(df['time'])
    cuaca_dfs.append(df)

cuaca_combined = pd.concat(cuaca_dfs, ignore_index=True)

print(f"Total data cuaca: {len(cuaca_combined)}")
print(f"Range tanggal: {cuaca_combined['tanggal'].min()} - {cuaca_combined['tanggal'].max()}")
print(f"\nKolom tersedia:")
print(cuaca_combined.columns.tolist())

Total data cuaca: 28610
Range tanggal: 2010-01-01 00:00:00 - 2025-08-31 00:00:00

Kolom tersedia:
['time', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'precipitation_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'temperature_2m_mean (°C)', 'relative_humidity_2m_mean (%)', 'cloud_cover_mean (%)', 'surface_pressure_mean (hPa)', 'wind_gusts_10m_max (km/h)', 'winddirection_10m_dominant (°)', 'relative_humidity_2m_max (%)', 'relative_humidity_2m_min (%)', 'cloud_cover_max (%)', 'cloud_cover_min (%)', 'wind_gusts_10m_mean (km/h)', 'wind_speed_10m_mean (km/h)', 'wind_gusts_10m_min (km/h)', 'wind_speed_10m_min (km/h)', 'surface_pressure_max (hPa)', 'surface_pressure_min (hPa)', 'stasiun_id', 'tanggal']


In [None]:
# Filter cuaca untuk tahun 2023-2025 dan pilih fitur temperatur (paling berkorelasi)
cuaca_filtered = cuaca_combined[cuaca_combined['tanggal'].dt.year.isin([2023, 2024, 2025])].copy()

# Pilih kolom temperatur yang relevan
temp_cols = ['temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'temperature_2m_mean (°C)']

# Rename kolom untuk memudahkan
cuaca_filtered = cuaca_filtered.rename(columns={
    'temperature_2m_max (°C)': 'temp_max',
    'temperature_2m_min (°C)': 'temp_min',
    'temperature_2m_mean (°C)': 'temp_mean'
})

# Pilih kolom yang diperlukan
cuaca_final = cuaca_filtered[['tanggal', 'stasiun_id', 'temp_max', 'temp_min', 'temp_mean']].copy()

print(f"Data cuaca 2023-2025: {len(cuaca_final)} rows")
print(f"\nStatistik temperatur:")
print(cuaca_final[['temp_max', 'temp_min', 'temp_mean']].describe())

Data cuaca 2023-2025: 4870 rows

Statistik temperatur:
          temp_max     temp_min    temp_mean
count  4870.000000  4870.000000  4870.000000
mean     31.826427    23.761971    27.127248
std       1.956923     0.884619     1.040020
min      24.800000    19.200000    23.600000
25%      30.600000    23.200000    26.400000
50%      31.800000    23.800000    27.100000
75%      33.000000    24.400000    27.800000
max      38.800000    26.400000    30.500000


## 4. Load dan Proses Data NDVI (Vegetation Index)

In [None]:
# Load data NDVI
ndvi = pd.read_csv(f'{BASE_PATH}/NDVI (vegetation index)/indeks-ndvi-jakarta.csv')
ndvi['tanggal'] = pd.to_datetime(ndvi['tanggal'])

# Filter untuk tahun 2023-2025
ndvi_filtered = ndvi[ndvi['tanggal'].dt.year.isin([2023, 2024, 2025])].copy()

print(f"Data NDVI 2023-2025: {len(ndvi_filtered)} rows")
print(f"Range tanggal: {ndvi_filtered['tanggal'].min()} - {ndvi_filtered['tanggal'].max()}")
print(f"\nStasiun unik: {ndvi_filtered['stasiun_id'].unique()}")
print(f"\nStatistik NDVI:")
print(ndvi_filtered['ndvi'].describe())

Data NDVI 2023-2025: 310 rows
Range tanggal: 2023-01-01 00:00:00 - 2025-08-29 00:00:00

Stasiun unik: <StringArray>
['DKI5', 'DKI1', 'DKI3', 'DKI4', 'DKI2']
Length: 5, dtype: str

Statistik NDVI:
count    310.000000
mean       0.330309
std        0.140193
min        0.031100
25%        0.225450
50%        0.327150
75%        0.450300
max        0.652600
Name: ndvi, dtype: float64


In [None]:
# NDVI direkam setiap ~16 hari, perlu di-interpolasi untuk setiap hari
# Buat NDVI untuk setiap stasiun dan interpolasi ke semua tanggal

def interpolate_ndvi(ndvi_df, date_range):
    """
    Interpolasi nilai NDVI untuk semua tanggal dalam range.
    NDVI direkam setiap ~16 hari, jadi perlu forward fill dan interpolasi.
    """
    result_dfs = []
    
    for stasiun in ndvi_df['stasiun_id'].unique():
        stasiun_data = ndvi_df[ndvi_df['stasiun_id'] == stasiun].copy()
        
        # Buat dataframe dengan semua tanggal
        all_dates = pd.DataFrame({'tanggal': date_range})
        all_dates['stasiun_id'] = stasiun
        
        # Merge dengan data NDVI
        merged = all_dates.merge(stasiun_data[['tanggal', 'ndvi']], on='tanggal', how='left')
        
        # Interpolasi linear untuk nilai yang hilang
        merged['ndvi'] = merged['ndvi'].interpolate(method='linear')
        
        # Forward fill untuk nilai awal yang mungkin NaN
        merged['ndvi'] = merged['ndvi'].ffill()
        
        # Backward fill untuk nilai akhir yang mungkin NaN
        merged['ndvi'] = merged['ndvi'].bfill()
        
        result_dfs.append(merged)
    
    return pd.concat(result_dfs, ignore_index=True)

# Buat date range untuk 2023-2025
date_range = pd.date_range(start='2023-01-01', end='2025-12-31', freq='D')

# Interpolasi NDVI
ndvi_interpolated = interpolate_ndvi(ndvi_filtered, date_range)

print(f"NDVI interpolated: {len(ndvi_interpolated)} rows")
print(f"Stasiun unik: {ndvi_interpolated['stasiun_id'].unique()}")
print(ndvi_interpolated.head(20))

NDVI interpolated: 5480 rows
Stasiun unik: <StringArray>
['DKI5', 'DKI1', 'DKI3', 'DKI4', 'DKI2']
Length: 5, dtype: str
      tanggal stasiun_id      ndvi
0  2023-01-01       DKI5  0.435800
1  2023-01-02       DKI5  0.411081
2  2023-01-03       DKI5  0.386362
3  2023-01-04       DKI5  0.361644
4  2023-01-05       DKI5  0.336925
5  2023-01-06       DKI5  0.312206
6  2023-01-07       DKI5  0.287488
7  2023-01-08       DKI5  0.262769
8  2023-01-09       DKI5  0.238050
9  2023-01-10       DKI5  0.213331
10 2023-01-11       DKI5  0.188613
11 2023-01-12       DKI5  0.163894
12 2023-01-13       DKI5  0.139175
13 2023-01-14       DKI5  0.114456
14 2023-01-15       DKI5  0.089738
15 2023-01-16       DKI5  0.065019
16 2023-01-17       DKI5  0.040300
17 2023-01-18       DKI5  0.044825
18 2023-01-19       DKI5  0.049350
19 2023-01-20       DKI5  0.053875


## 5. Gabungkan Semua Data

In [None]:
# Gabungkan ISPU dengan data cuaca (berdasarkan tanggal dan stasiun)
df_merged = ispu_combined.merge(
    cuaca_final, 
    on=['tanggal', 'stasiun_id'], 
    how='left'
)

print(f"Setelah merge dengan cuaca: {len(df_merged)} rows")
print(f"Missing temp_mean: {df_merged['temp_mean'].isna().sum()}")

Setelah merge dengan cuaca: 4870 rows
Missing temp_mean: 155


In [None]:
# Gabungkan dengan data NDVI (berdasarkan tanggal dan stasiun)
df_merged = df_merged.merge(
    ndvi_interpolated, 
    on=['tanggal', 'stasiun_id'], 
    how='left'
)

print(f"Setelah merge dengan NDVI: {len(df_merged)} rows")
print(f"Missing NDVI: {df_merged['ndvi'].isna().sum()}")

Setelah merge dengan NDVI: 4870 rows
Missing NDVI: 155


In [None]:
# Lihat hasil penggabungan
print(f"Total data gabungan: {len(df_merged)} rows")
print(f"\nKolom yang tersedia:")
print(df_merged.columns.tolist())

print(f"\n\nSample data:")
df_merged.head(10)

Total data gabungan: 4870 rows

Kolom yang tersedia:
['tanggal', 'stasiun_id', 'pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori', 'temp_max', 'temp_min', 'temp_mean', 'is_holiday_nasional', 'is_weekend', 'is_libur_or_weekend', 'day_of_week', 'days_after_eid', 'is_eid_period', 'is_post_eid_clean', 'ndvi']


Sample data:


Unnamed: 0,tanggal,stasiun_id,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,temp_max,temp_min,temp_mean,is_holiday_nasional,is_weekend,is_libur_or_weekend,day_of_week,days_after_eid,is_eid_period,is_post_eid_clean,ndvi
0,2022-12-01,DKI1,54.0,73.0,36.0,12.0,22.0,13.0,73.0,PM25,SEDANG,,,,,,,,,,,
1,2022-12-01,DKI2,59.0,74.0,46.0,15.0,50.0,31.0,74.0,PM25,SEDANG,,,,,,,,,,,
2,2022-12-01,DKI3,53.0,81.0,42.0,9.0,24.0,12.0,81.0,PM25,SEDANG,,,,,,,,,,,
3,2022-12-01,DKI4,64.0,93.0,52.0,7.0,27.0,19.0,93.0,PM25,SEDANG,,,,,,,,,,,
4,2022-12-01,DKI5,52.0,,18.0,17.0,27.0,5.0,52.0,PM10,SEDANG,,,,,,,,,,,
5,2022-12-02,DKI1,53.0,67.0,39.0,11.0,14.0,12.0,67.0,PM25,SEDANG,,,,,,,,,,,
6,2022-12-02,DKI2,55.0,67.0,51.0,14.0,42.0,30.0,67.0,PM25,SEDANG,,,,,,,,,,,
7,2022-12-02,DKI3,55.0,92.0,43.0,11.0,24.0,13.0,92.0,PM25,SEDANG,,,,,,,,,,,
8,2022-12-02,DKI4,60.0,94.0,52.0,9.0,20.0,18.0,94.0,PM25,SEDANG,,,,,,,,,,,
9,2022-12-02,DKI5,48.0,,20.0,18.0,21.0,5.0,48.0,PM10,BAIK,,,,,,,,,,,


## 6. Feature Engineering

In [None]:
# Buat fitur tambahan dari tanggal
df_merged['year'] = df_merged['tanggal'].dt.year
df_merged['month'] = df_merged['tanggal'].dt.month
df_merged['day'] = df_merged['tanggal'].dt.day
df_merged['day_of_year'] = df_merged['tanggal'].dt.dayofyear

# Fitur musim (berdasarkan cuaca di Jakarta: musim hujan dan kemarau)
# Musim hujan: Oktober - Maret
# Musim kemarau: April - September
df_merged['is_rainy_season'] = df_merged['month'].apply(
    lambda x: 1 if x in [10, 11, 12, 1, 2, 3] else 0
)

# Fitur temperatur range (selisih max dan min)
df_merged['temp_range'] = df_merged['temp_max'] - df_merged['temp_min']

print("Fitur baru yang ditambahkan:")
print(df_merged[['tanggal', 'year', 'month', 'day', 'day_of_year', 'is_rainy_season', 'temp_range']].head(10))

Fitur baru yang ditambahkan:
     tanggal  year  month  day  day_of_year  is_rainy_season  temp_range
0 2022-12-01  2022     12    1          335                1         NaN
1 2022-12-01  2022     12    1          335                1         NaN
2 2022-12-01  2022     12    1          335                1         NaN
3 2022-12-01  2022     12    1          335                1         NaN
4 2022-12-01  2022     12    1          335                1         NaN
5 2022-12-02  2022     12    2          336                1         NaN
6 2022-12-02  2022     12    2          336                1         NaN
7 2022-12-02  2022     12    2          336                1         NaN
8 2022-12-02  2022     12    2          336                1         NaN
9 2022-12-02  2022     12    2          336                1         NaN


In [None]:
# Encode stasiun_id ke numerik
stasiun_mapping = {
    'DKI1': 1,
    'DKI2': 2,
    'DKI3': 3,
    'DKI4': 4,
    'DKI5': 5
}
df_merged['stasiun_encoded'] = df_merged['stasiun_id'].map(stasiun_mapping)

# Encode kategori ke numerik untuk target
kategori_mapping = {
    'BAIK': 0,
    'SEDANG': 1,
    'TIDAK SEHAT': 2,
    'SANGAT TIDAK SEHAT': 3,
    'BERBAHAYA': 4,
    'TIDAK ADA DATA': -1
}
df_merged['kategori_encoded'] = df_merged['kategori'].map(kategori_mapping)

print("Mapping kategori:")
for k, v in kategori_mapping.items():
    print(f"  {k}: {v}")

print(f"\nDistribusi kategori_encoded:")
print(df_merged['kategori_encoded'].value_counts().sort_index())

Mapping kategori:
  BAIK: 0
  SEDANG: 1
  TIDAK SEHAT: 2
  SANGAT TIDAK SEHAT: 3
  BERBAHAYA: 4
  TIDAK ADA DATA: -1

Distribusi kategori_encoded:
kategori_encoded
-1      39
 0     617
 1    3684
 2     526
 3       4
Name: count, dtype: int64


In [None]:
# Buat fitur lag (nilai ISPU hari sebelumnya) - untuk setiap stasiun
# Sort terlebih dahulu
df_merged = df_merged.sort_values(['stasiun_id', 'tanggal']).reset_index(drop=True)

# Lag features untuk max value
for lag in [1, 2, 3, 7]:
    df_merged[f'max_lag_{lag}'] = df_merged.groupby('stasiun_id')['max'].shift(lag)

# Rolling mean untuk max value (7 hari terakhir)
df_merged['max_rolling_7d_mean'] = df_merged.groupby('stasiun_id')['max'].transform(
    lambda x: x.rolling(window=7, min_periods=1).mean()
)

# Rolling std untuk max value (7 hari terakhir)
df_merged['max_rolling_7d_std'] = df_merged.groupby('stasiun_id')['max'].transform(
    lambda x: x.rolling(window=7, min_periods=1).std()
)

print("Lag features added:")
print(df_merged[['tanggal', 'stasiun_id', 'max', 'max_lag_1', 'max_lag_2', 'max_lag_3', 'max_lag_7', 'max_rolling_7d_mean']].head(15))

Lag features added:
      tanggal stasiun_id   max  max_lag_1  max_lag_2  max_lag_3  max_lag_7  \
0  2022-12-01       DKI1  73.0        NaN        NaN        NaN        NaN   
1  2022-12-02       DKI1  67.0       73.0        NaN        NaN        NaN   
2  2022-12-03       DKI1  76.0       67.0       73.0        NaN        NaN   
3  2022-12-04       DKI1  76.0       76.0       67.0       73.0        NaN   
4  2022-12-05       DKI1  74.0       76.0       76.0       67.0        NaN   
5  2022-12-06       DKI1  79.0       74.0       76.0       76.0        NaN   
6  2022-12-07       DKI1  68.0       79.0       74.0       76.0        NaN   
7  2022-12-08       DKI1  62.0       68.0       79.0       74.0       73.0   
8  2022-12-09       DKI1  73.0       62.0       68.0       79.0       67.0   
9  2022-12-10       DKI1  42.0       73.0       62.0       68.0       76.0   
10 2022-12-11       DKI1  61.0       42.0       73.0       62.0       76.0   
11 2022-12-12       DKI1  43.0       61.0   

## 7. Handle Missing Values

In [None]:
# Cek missing values
print("Missing values per kolom:")
missing_info = df_merged.isnull().sum()
missing_pct = (df_merged.isnull().sum() / len(df_merged) * 100).round(2)
missing_df = pd.DataFrame({'missing_count': missing_info, 'missing_pct': missing_pct})
print(missing_df[missing_df['missing_count'] > 0])

Missing values per kolom:
                           missing_count  missing_pct
pm_sepuluh                           408         8.38
pm_duakomalima                       348         7.15
sulfur_dioksida                       92         1.89
karbon_monoksida                      88         1.81
ozon                                  72         1.48
nitrogen_dioksida                    132         2.71
max                                   10         0.21
parameter_pencemar_kritis             75         1.54
temp_max                             155         3.18
temp_min                             155         3.18
temp_mean                            155         3.18
is_holiday_nasional                  155         3.18
is_weekend                           155         3.18
is_libur_or_weekend                  155         3.18
day_of_week                          155         3.18
days_after_eid                       155         3.18
is_eid_period                        155         3.18
is

In [None]:
# Handle missing values dengan strategi yang sesuai

# 1. Hapus baris dengan kategori TIDAK ADA DATA (tidak bisa digunakan untuk training)
df_clean = df_merged[df_merged['kategori'] != 'TIDAK ADA DATA'].copy()
print(f"Setelah hapus TIDAK ADA DATA: {len(df_clean)} rows (dari {len(df_merged)})")

# 2. Fill missing numerik dengan median per stasiun
numeric_cols_to_fill = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 
                        'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max',
                        'temp_max', 'temp_min', 'temp_mean', 'temp_range', 'ndvi']

for col in numeric_cols_to_fill:
    if col in df_clean.columns:
        df_clean[col] = df_clean.groupby('stasiun_id')[col].transform(
            lambda x: x.fillna(x.median())
        )

# 3. Fill lag features dengan forward fill dalam grup stasiun
lag_cols = ['max_lag_1', 'max_lag_2', 'max_lag_3', 'max_lag_7', 'max_rolling_7d_mean', 'max_rolling_7d_std']
for col in lag_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean.groupby('stasiun_id')[col].transform(
            lambda x: x.ffill().bfill()
        )

# 4. Fill remaining NaN dengan global median
for col in df_clean.select_dtypes(include=[np.number]).columns:
    if df_clean[col].isnull().sum() > 0:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

print(f"\nMissing values setelah handling:")
print(df_clean.isnull().sum().sum())

Setelah hapus TIDAK ADA DATA: 4831 rows (dari 4870)

Missing values setelah handling:
44


## 8. Finalisasi Dataset

In [None]:
# Pilih fitur yang akan digunakan untuk model
feature_cols = [
    # Fitur ISPU (numerik)
    'pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 
    'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max',
    
    # Fitur Cuaca (temperatur - paling berkorelasi)
    'temp_max', 'temp_min', 'temp_mean', 'temp_range',
    
    # Fitur Waktu
    'year', 'month', 'day', 'day_of_year', 'day_of_week', 'is_rainy_season',
    
    # Fitur NDVI
    'ndvi',
    
    # Fitur Stasiun
    'stasiun_encoded',
    
    # Fitur Lag dan Rolling
    'max_lag_1', 'max_lag_2', 'max_lag_3', 'max_lag_7',
    'max_rolling_7d_mean', 'max_rolling_7d_std'
]

# Target
target_col = 'kategori_encoded'

print("Fitur yang digunakan:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i}. {col}")

print(f"\nTarget: {target_col}")

Fitur yang digunakan:
  1. pm_sepuluh
  2. pm_duakomalima
  3. sulfur_dioksida
  4. karbon_monoksida
  5. ozon
  6. nitrogen_dioksida
  7. max
  8. temp_max
  9. temp_min
  10. temp_mean
  11. temp_range
  12. year
  13. month
  14. day
  15. day_of_year
  16. day_of_week
  17. is_rainy_season
  18. is_holiday_nasional
  19. is_weekend
  20. is_libur_or_weekend
  21. days_after_eid
  22. is_eid_period
  23. is_post_eid_clean
  24. ndvi
  25. stasiun_encoded
  26. max_lag_1
  27. max_lag_2
  28. max_lag_3
  29. max_lag_7
  30. max_rolling_7d_mean
  31. max_rolling_7d_std

Target: kategori_encoded


In [None]:
# Buat dataset final
df_final = df_clean[['tanggal', 'stasiun_id', 'kategori'] + feature_cols + [target_col]].copy()

# Sort by tanggal dan stasiun_id
df_final = df_final.sort_values(['tanggal', 'stasiun_id']).reset_index(drop=True)

print(f"Dataset final: {len(df_final)} rows x {len(df_final.columns)} columns")
print(f"\nRange tanggal: {df_final['tanggal'].min()} - {df_final['tanggal'].max()}")
print(f"\nDistribusi kategori:")
print(df_final['kategori'].value_counts())

df_final.head(10)

Dataset final: 4831 rows x 35 columns

Range tanggal: 2022-12-01 00:00:00 - 2025-08-31 00:00:00

Distribusi kategori:
kategori
SEDANG                3684
BAIK                   617
TIDAK SEHAT            526
SANGAT TIDAK SEHAT       4
Name: count, dtype: int64


Unnamed: 0,tanggal,stasiun_id,kategori,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,temp_max,temp_min,temp_mean,temp_range,year,month,day,day_of_year,day_of_week,is_rainy_season,is_holiday_nasional,is_weekend,is_libur_or_weekend,days_after_eid,is_eid_period,is_post_eid_clean,ndvi,stasiun_encoded,max_lag_1,max_lag_2,max_lag_3,max_lag_7,max_rolling_7d_mean,max_rolling_7d_std,kategori_encoded
0,2022-12-01,DKI1,SEDANG,54.0,73.0,36.0,12.0,22.0,13.0,73.0,32.0,24.0,27.3,7.9,2022,12,1,335,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.229287,1,73.0,73.0,73.0,73.0,73.0,4.242641,1
1,2022-12-01,DKI2,SEDANG,59.0,74.0,46.0,15.0,50.0,31.0,74.0,31.2,24.4,27.35,6.8,2022,12,1,335,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.216034,2,74.0,74.0,74.0,74.0,74.0,4.949747,1
2,2022-12-01,DKI3,SEDANG,53.0,81.0,42.0,9.0,24.0,12.0,81.0,31.6,23.2,26.6,8.4,2022,12,1,335,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.444569,3,81.0,81.0,81.0,81.0,81.0,7.778175,1
3,2022-12-01,DKI4,SEDANG,64.0,93.0,52.0,7.0,27.0,19.0,93.0,31.8,23.5,26.9,8.3,2022,12,1,335,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.490013,4,93.0,93.0,93.0,93.0,93.0,0.707107,1
4,2022-12-01,DKI5,SEDANG,52.0,78.0,18.0,17.0,27.0,5.0,52.0,32.0,23.9,27.3,8.0,2022,12,1,335,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.355784,5,52.0,52.0,52.0,52.0,52.0,2.828427,1
5,2022-12-02,DKI1,SEDANG,53.0,67.0,39.0,11.0,14.0,12.0,67.0,32.0,24.0,27.3,7.9,2022,12,2,336,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.229287,1,73.0,73.0,73.0,73.0,70.0,4.242641,1
6,2022-12-02,DKI2,SEDANG,55.0,67.0,51.0,14.0,42.0,30.0,67.0,31.2,24.4,27.35,6.8,2022,12,2,336,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.216034,2,74.0,74.0,74.0,74.0,70.5,4.949747,1
7,2022-12-02,DKI3,SEDANG,55.0,92.0,43.0,11.0,24.0,13.0,92.0,31.6,23.2,26.6,8.4,2022,12,2,336,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.444569,3,81.0,81.0,81.0,81.0,86.5,7.778175,1
8,2022-12-02,DKI4,SEDANG,60.0,94.0,52.0,9.0,20.0,18.0,94.0,31.8,23.5,26.9,8.3,2022,12,2,336,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.490013,4,93.0,93.0,93.0,93.0,93.5,0.707107,1
9,2022-12-02,DKI5,BAIK,48.0,78.0,20.0,18.0,21.0,5.0,48.0,32.0,23.9,27.3,8.0,2022,12,2,336,3.0,1,0.0,0.0,0.0,-1.0,0.0,0.0,0.355784,5,52.0,52.0,52.0,52.0,50.0,2.828427,0


In [None]:
# Statistik deskriptif fitur
print("Statistik deskriptif fitur numerik:")
df_final[feature_cols].describe().round(2)

Statistik deskriptif fitur numerik:


Unnamed: 0,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,temp_max,temp_min,temp_mean,temp_range,year,month,day,day_of_year,day_of_week,is_rainy_season,is_holiday_nasional,is_weekend,is_libur_or_weekend,days_after_eid,is_eid_period,is_post_eid_clean,ndvi,stasiun_encoded,max_lag_1,max_lag_2,max_lag_3,max_lag_7,max_rolling_7d_mean,max_rolling_7d_std
count,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0,4831.0
mean,50.91,73.89,36.64,14.36,25.71,23.33,73.9,31.8,23.76,27.11,8.04,2023.84,6.03,15.73,168.16,3.0,0.47,0.05,0.28,0.31,-0.86,0.04,0.02,0.33,3.0,73.53,73.45,73.48,73.49,73.55,12.22
std,15.95,23.16,15.4,6.95,13.36,14.88,22.31,1.93,0.88,1.03,2.04,0.83,3.32,8.8,101.48,1.97,0.5,0.22,0.45,0.46,0.9,0.19,0.15,0.13,1.41,22.84,22.69,22.87,22.92,18.54,7.42
min,3.0,10.0,3.0,1.0,2.0,0.0,14.0,24.8,19.2,23.6,1.5,2022.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.03,1.0,0.0,0.0,0.0,0.0,16.0,0.0
25%,40.0,59.0,24.0,9.0,16.0,13.0,58.0,30.6,23.2,26.4,6.8,2023.0,3.0,8.0,82.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.22,2.0,57.5,58.0,57.0,57.5,60.0,7.7
50%,53.0,76.0,34.0,13.0,23.0,20.0,73.0,31.8,23.8,27.1,7.9,2024.0,6.0,16.0,163.0,3.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.32,3.0,73.0,73.0,73.0,73.0,74.14,10.88
75%,61.0,88.0,52.0,19.0,32.0,31.0,88.0,32.9,24.4,27.8,9.25,2024.0,9.0,23.0,244.0,5.0,1.0,0.0,1.0,1.0,-1.0,0.0,0.0,0.44,4.0,88.0,88.0,88.0,88.0,86.29,15.02
max,187.0,287.0,112.0,70.0,115.0,202.0,287.0,38.8,26.4,30.5,15.1,2025.0,12.0,31.0,366.0,6.0,1.0,1.0,1.0,1.0,7.0,1.0,1.0,0.65,5.0,287.0,249.0,287.0,287.0,173.57,96.34


## 9. Simpan Dataset Preprocessed

In [None]:
# Buat folder output jika belum ada
output_dir = '../data_processed'
os.makedirs(output_dir, exist_ok=True)

# Simpan dataset final
output_path = f'{output_dir}/ispu_preprocessed_2023_2025.csv'
df_final.to_csv(output_path, index=False)
print(f"Dataset disimpan ke: {output_path}")

# Simpan juga versi tanpa tanggal untuk training langsung
df_train_ready = df_final[feature_cols + [target_col]].copy()
train_output_path = f'{output_dir}/ispu_train_ready.csv'
df_train_ready.to_csv(train_output_path, index=False)
print(f"Dataset training disimpan ke: {train_output_path}")

Dataset disimpan ke: ../data_processed/ispu_preprocessed_2023_2025.csv
Dataset training disimpan ke: ../data_processed/ispu_train_ready.csv


In [None]:
# Simpan mapping untuk referensi
mappings = {
    'kategori_mapping': kategori_mapping,
    'stasiun_mapping': stasiun_mapping,
    'feature_cols': feature_cols,
    'target_col': target_col
}

import json
mapping_path = f'{output_dir}/mappings.json'
with open(mapping_path, 'w') as f:
    json.dump(mappings, f, indent=2)
print(f"Mappings disimpan ke: {mapping_path}")

Mappings disimpan ke: ../data_processed/mappings.json


## 10. Ringkasan Preprocessing

### Data yang Digunakan:
1. **ISPU 2023-2025**: Data indeks standar pencemar udara dengan komponen PM10, PM2.5, SO2, CO, O3, NO2
2. **Cuaca Harian**: Fokus pada temperatur (max, min, mean, range) karena paling berkorelasi
3. **NDVI**: Indeks vegetasi yang di-interpolasi

### Fitur yang Dibuat:
- **Fitur Temporal**: year, month, day, day_of_year, day_of_week, is_rainy_season
- **Fitur Cuaca**: temp_max, temp_min, temp_mean, temp_range
- **Fitur Vegetasi**: ndvi
- **Fitur Lag/Rolling**: max_lag_1/2/3/7, max_rolling_7d_mean, max_rolling_7d_std

### Target:
- kategori_encoded: 0=BAIK, 1=SEDANG, 2=TIDAK SEHAT, 3=SANGAT TIDAK SEHAT, 4=BERBAHAYA

In [None]:
# Ringkasan final
print("=" * 60)
print("RINGKASAN PREPROCESSING")
print("=" * 60)
print(f"\n📊 Total data: {len(df_final)} rows")
print(f"📅 Range tanggal: {df_final['tanggal'].min().strftime('%Y-%m-%d')} - {df_final['tanggal'].max().strftime('%Y-%m-%d')}")
print(f"🏢 Jumlah stasiun: {df_final['stasiun_id'].nunique()}")
print(f"📈 Jumlah fitur: {len(feature_cols)}")

print(f"\n🎯 Distribusi Kategori:")
for cat, count in df_final['kategori'].value_counts().items():
    pct = count / len(df_final) * 100
    print(f"   {cat}: {count} ({pct:.1f}%)")

print(f"\n📁 File output:")
print(f"   - {output_path}")
print(f"   - {train_output_path}")
print(f"   - {mapping_path}")
print("=" * 60)

RINGKASAN PREPROCESSING

📊 Total data: 4831 rows
📅 Range tanggal: 2022-12-01 - 2025-08-31
🏢 Jumlah stasiun: 5
📈 Jumlah fitur: 31

🎯 Distribusi Kategori:
   SEDANG: 3684 (76.3%)
   BAIK: 617 (12.8%)
   TIDAK SEHAT: 526 (10.9%)
   SANGAT TIDAK SEHAT: 4 (0.1%)

📁 File output:
   - ../data_processed/ispu_preprocessed_2023_2025.csv
   - ../data_processed/ispu_train_ready.csv
   - ../data_processed/mappings.json
