In [None]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import numpy as np
import matplotlib.pyplot as plt
import time
import warnings
import joblib
import os
import io
import contextlib

from sklearn.model_selection import train_test_split, cross_val_score, KFold, TimeSeriesSplit
from sklearn.metrics import  f1_score, mean_squared_error, r2_score, make_scorer, accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr
from xgboost.callback import EarlyStopping
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn/booster warnings that could flag real problems.
warnings.filterwarnings('ignore')

In [None]:
# Random seed shared by the train/validation split and the three boosting libraries.
SEED = 42
# Number of target days per sample (offsets 0 .. N_DAYS-1 from the sample date).
N_DAYS = 10
# Number of past days (lags 1 .. PAST_DAYS) supplied as input features.
PAST_DAYS = 5

In [None]:
# Toggle these two flags to select the model variant (they also pick the output folder name).
# pm25: when True, lagged PM2.5 values are added to the input features.
pm25 = False
# improvement: when True, the engineered features (season, diffusion_conditions)
# and the population columns are added to the inputs.
improvement = True

In [None]:
# Output folder name encodes the active variant, e.g. "<prefix>_no_pm25_yes_improvement".
folder_prefix = '../Models/saved_models'
folder_suffix = [
    f"{'yes' if pm25 else 'no'}_pm25",
    f"{'yes' if improvement else 'no'}_improvement",
]
models_folder = '_'.join([folder_prefix, *folder_suffix])

models_folder

In [None]:
# Main observation table (CSV) and the population table (Excel); both are later joined on 'SID'.
data = pd.read_csv('../Inputs/data_onkk.csv')
population_data = pd.read_excel('../Inputs/population_data.xlsx')

In [None]:
# Inner join on 'SID': rows whose SID is missing from either table are dropped.
data = pd.merge(data, population_data, on='SID', how='inner')

In [None]:
# Parse the month/day/year date strings once, derive the calendar parts from the
# parsed values, then order the rows by station id and date.
timestamps = pd.to_datetime(data['time'], format='%m/%d/%Y')

data = (
    data
    .assign(
        time=timestamps,
        day=timestamps.dt.day,
        month=timestamps.dt.month,
        year=timestamps.dt.year,
    )
    .sort_values(by=['SID', 'time'])
)

In [None]:
def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'.
    Every other value (including 9-11) yields 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Autumn'

def feature_engineering(df):
    """Add the engineered columns when the global `improvement` flag is on.

    With `improvement` enabled, two columns are written onto `df` itself:
    'season' (from `get_season` applied to 'month') and
    'diffusion_conditions' ('WSPD' multiplied by 'TP').
    With the flag off, `df` is returned untouched.

    Returns the same DataFrame object that was passed in.
    """
    if improvement:
        df['season'] = df['month'].apply(get_season)
        df['diffusion_conditions'] = df['WSPD'] * df['TP']

    return df

# Must run before the lag cell: 'diffusion_conditions' is lagged there when `improvement` is on.
data = feature_engineering(data)

In [None]:
# Columns whose previous-day values become model inputs.
lag_features = ['WSPD', 'WDIR', 'TMP', 'TX', 'TN', 'TP', 'RH', 'PRES2M']

if improvement:
    lag_features += ['diffusion_conditions']

if pm25:
    lag_features += ['pm25']

# Un-lagged values, taken once. For each lag the whole set of features is shifted
# forward by `lag` days and joined back on (SID, time), so a row dated t receives
# the values observed at t - lag. One merge per lag replaces one merge per
# (lag, feature); the resulting columns and their order are unchanged.
lag_source = data[['SID', 'time'] + lag_features]

for lag in range(1, PAST_DAYS + 1):
    lagged = lag_source.rename(
        columns={feature: f'{feature}_lag_{lag}' for feature in lag_features}
    )
    lagged = lagged.assign(time=lagged['time'] + pd.Timedelta(days=lag))
    # Left join keeps every original row; missing history shows up as NaN.
    data = data.merge(lagged, on=['SID', 'time'], how='left')

In [None]:
# For every horizon `day`, attach the PM2.5 value observed `day` days after the
# row's date (same station) as 'pm25_target_{day}'. Horizon 0 is the row's own day.
for day in range(N_DAYS):
    future = (
        data[['SID', 'time', 'pm25']]
        .rename(columns={'pm25': f'pm25_target_{day}'})
        # Shift the future observation back so it lines up with the row it is a target for.
        .assign(time=lambda frame, offset=day: frame['time'] - pd.Timedelta(days=offset))
    )
    data = data.merge(future, on=['SID', 'time'], how='left')

In [None]:
# Keep only rows that have a PM2.5 target for every one of the N_DAYS horizons.
data = data.dropna(subset=[f'pm25_target_{day}' for day in range(0, N_DAYS)])

In [None]:
def pm25_to_aqi(pm25):
    """Convert a PM2.5 concentration to an integer AQI by piecewise-linear interpolation.

    Each breakpoint row is (conc_low, conc_high, aqi_low, aqi_high); a concentration
    in [conc_low, conc_high) is mapped linearly onto [aqi_low, aqi_high], rounded,
    and capped at aqi_high.

    Parameters
    ----------
    pm25 : float
        PM2.5 concentration.

    Returns
    -------
    int
        AQI in 0..500. Negative concentrations return 0 (previously they matched
        no band and were reported as 500). Concentrations of 500 or more return 500.
        NaN matches no band and therefore also yields the 500 fallback, so callers
        should drop missing values first.
    """
    bp = [
        (0, 25, 0, 50),
        (25, 50, 51, 100),
        (50, 80, 101, 150),
        (80, 150, 151, 200),
        (150, 250, 201, 300),
        (250, 350, 301, 400),
        (350, 500, 401, 500),
    ]

    # A negative reading is below the first band, not above the last one.
    if pm25 < 0:
        return 0

    for (bp_low, bp_high, i_low, i_high) in bp:
        if bp_low <= pm25 < bp_high:
            aqi = ((i_high - i_low) / (bp_high - bp_low)) * (pm25 - bp_low) + i_low
            return min(round(aqi), i_high)

    # At or beyond the top breakpoint the index saturates.
    return 500

def aqi_category(aqi):
    """Map an AQI value to its category index.

    0: aqi <= 50, 1: (50, 100], 2: (100, 150], 3: (150, 200], 4: (200, 300],
    5: above 300.

    The bands are contiguous, so fractional values such as 50.5 land in the next
    band up instead of falling through to category 5. Integer inputs give the
    same result as before.
    """
    if aqi <= 50:
        return 0
    if aqi <= 100:
        return 1
    if aqi <= 150:
        return 2
    if aqi <= 200:
        return 3
    if aqi <= 300:
        return 4
    return 5

# Element-wise wrappers so the scalar helpers can be called on whole columns.
vectorized_pm25_to_aqi = np.vectorize(pm25_to_aqi)
vectorized_aqi_category = np.vectorize(aqi_category)

In [None]:
# Turn each horizon's PM2.5 target into an AQI category label:
# concentration -> AQI -> category index.
for day in range(N_DAYS):
    feature, target = f'pm25_target_{day}', f'AQI_cat_target_{day}'
    aqi_values = vectorized_pm25_to_aqi(data[feature])
    data[target] = vectorized_aqi_category(aqi_values)

In [None]:
# Model inputs: one static column plus every lagged column, grouped by feature
# and ordered by lag within each feature.
features = ['SQRT_SEA_DEM_LAT']
features.extend(
    f'{name}_lag_{lag_idx}'
    for name in lag_features
    for lag_idx in range(1, PAST_DAYS + 1)
)

if improvement:
    features.extend(['urbanization_rate', 'population_density', 'season'])

# One classification target per forecast horizon.
targets = [f'AQI_cat_target_{horizon}' for horizon in range(N_DAYS)]

X, y = data[features], data[targets]

In [None]:
# Hold out 2021-06-01 .. 2021-12-31 (both ends inclusive) as the test period;
# every other row feeds the train/validation pool.
test_time = data['time'].between('2021-06-01', '2021-12-31')

X_test, y_test = X.loc[test_time], y.loc[test_time]
X_train_full, y_train_full = X.loc[~test_time], y.loc[~test_time]

# NOTE(review): the pool is shuffled before the 75/25 split although neighbouring
# days share lagged inputs and overlapping multi-day targets, so validation
# scores are probably optimistic — confirm whether a time-ordered split is wanted.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    shuffle=True,
    random_state=SEED,
)

total_samples = X.shape[0]

splits = {"Train": X_train, "Validation": X_valid, "Test": X_test}

# Report each split's size and its share of all samples.
for name, X_split in splits.items():
    n = len(X_split)
    pct = 100 * n / total_samples
    print(f"{name} samples: {n} ({pct:.2f}% of total)")

In [None]:
# Object-dtype columns of the training inputs; these are label-encoded next.
categorical_columns = X_train.select_dtypes(include=['object']).columns

In [None]:
# One fitted encoder per categorical column, keyed by column name. Refitting a
# single shared encoder would leave only the last column's class mapping
# available for inverse transforms or for encoding new data later.
label_encoders = {}

# `label_encoder` stays defined (bound to the most recently fitted encoder) for
# any later code that still refers to it.
label_encoder = LabelEncoder()

for col in categorical_columns:
    label_encoder = LabelEncoder()
    # Fit on the training split only; validation/test reuse the same mapping.
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_valid[col] = label_encoder.transform(X_valid[col])
    X_test[col] = label_encoder.transform(X_test[col])
    label_encoders[col] = label_encoder

In [None]:
# Standardise every input column using statistics learned from the training split
# only, then wrap the resulting arrays back into DataFrames with the original
# column names (the row index becomes a fresh 0..n-1 range).
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_valid, X_test = (
    pd.DataFrame(scaler.transform(split), columns=X.columns)
    for split in (X_train, X_valid, X_test)
)

In [None]:
# Base hyper-parameters for the three boosting libraries. Each dict requests GPU
# execution and a budget of 10000 rounds; the training cell adds the evaluation
# metric and applies early stopping.

# XGBoost (consumed through xgb.train; 'n_estimators' is popped there and used as the round count).
XGB_Params = {
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 10000,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  
    'reg_lambda': 5,
    'random_state': SEED,
    'tree_method': 'hist',
    'device': 'cuda'
}

# LightGBM (passed to lgb.LGBMClassifier).
LGBM_Params = {
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 10000,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda_l1': 10,  
    'lambda_l2': 0.1, 
    'random_state': SEED,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': -1
}

# CatBoost (passed to cb.CatBoostClassifier).
CatBoost_Params = {
    'learning_rate': 0.1,
    'depth': 8,
    'iterations': 10000,
    'random_seed': SEED,
    'task_type': 'GPU',
    'devices': '0', 
    'verbose': False
}

In [None]:
# Each library's name for the multiclass log-loss metric.
XGB_Params.update({ 'eval_metric': 'mlogloss' })
LGBM_Params.update({ 'verbosity': -1, 'eval_metric': 'multi_logloss' })
CatBoost_Params.update({ 'eval_metric': 'MultiClass' })

# Stop a model once the validation metric has not improved for this many rounds.
early_stopping_rounds = 100
# models_dict[algorithm][target column] -> fitted model for that forecast horizon.
models_dict = {}

for alg in ['CatBoost', 'XGB', 'LGBM']:
    print(f"\nTraining models for algorithm: {alg}")
    models_per_target = {}

    for target in targets:
        print(f"  Training {alg} model for target: {target}")

        if alg == 'CatBoost':
            model = cb.CatBoostClassifier(**CatBoost_Params)
            model.fit(
                X_train, y_train[target],
                eval_set=(X_valid, y_valid[target]),
                early_stopping_rounds=early_stopping_rounds,
                use_best_model=True,
                verbose=False
            )
            models_per_target[target] = model

        elif alg == 'XGB':
            params    = XGB_Params.copy()
            # The native API takes the round budget as an argument, not a parameter.
            num_round = params.pop('n_estimators')

            # XGBoost requires every label to lie in [0, num_class). Counting the
            # distinct training labels under-sizes num_class whenever a category
            # is absent (e.g. labels {0, 1, 2, 3, 5}), so size it from the largest
            # label seen in either split instead.
            num_class = int(max(y_train[target].max(), y_valid[target].max())) + 1

            params.update({
                'objective': 'multi:softmax',
                'num_class': num_class
            })

            dtrain = xgb.DMatrix(X_train, label=y_train[target])
            dvalid = xgb.DMatrix(X_valid,  label=y_valid[target])

            booster = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=num_round,
                # The last entry ('valid') is the one early stopping monitors.
                evals=[(dtrain, 'train'), (dvalid, 'valid')],
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=False
            )
            # xgb.train hands back the model from the final round, which includes
            # the rounds trained after the best validation score. Keep only the
            # rounds up to the best iteration so this path matches CatBoost's
            # use_best_model behaviour.
            booster = booster[: booster.best_iteration + 1]
            models_per_target[target] = booster

        else:  # LGBM
            model = lgb.LGBMClassifier(**LGBM_Params)
            # Swallow anything LightGBM prints to stdout during fitting.
            with contextlib.redirect_stdout(io.StringIO()):
                model.fit(
                    X_train,
                    y_train[target],
                    eval_set=[(X_valid, y_valid[target])],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                        lgb.log_evaluation(period=0)
                    ]
                )
            models_per_target[target] = model

    models_dict[alg] = models_per_target

In [None]:
class MultiOutputWrapper:
    """Bundle per-target models behind a single multi-column `predict`.

    `models` maps a target name to its fitted model; `target_list` fixes the
    order of the output columns.
    """

    def __init__(self, models, target_list):
        self.models = models
        self.targets = target_list

    def predict(self, X):
        """Predict every target for `X` and stack the results column-wise.

        Native XGBoost boosters receive `X` wrapped in a DMatrix; any other
        model is called with `X` directly.
        """
        def _predict_one(model):
            if isinstance(model, xgb.Booster):
                return model.predict(xgb.DMatrix(X))
            return model.predict(X)

        per_target = [_predict_one(self.models[name]) for name in self.targets]
        return np.column_stack(per_target)

In [None]:
def evaluate_model(model, X, y, dataset_name):
    """Print accuracy, weighted precision and weighted recall for every target.

    Parameters
    ----------
    model : object exposing `predict(X)` (one column per target) and `targets`.
    X : model inputs.
    y : true labels, one column per target in the same order as `model.targets`.
    dataset_name : label printed at the start of each report line.
    """
    predicted = np.array(model.predict(X))
    actual = np.array(y)

    for column, target_name in enumerate(model.targets):
        yt, yp = actual[:, column], predicted[:, column]

        acc = accuracy_score(yt, yp)
        # zero_division=0: a class that is never predicted scores 0 instead of warning.
        prec = precision_score(yt, yp, average='weighted', zero_division=0)
        rec = recall_score(yt, yp, average='weighted', zero_division=0)

        print(f"{dataset_name} - {target_name}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

In [None]:
# Report per-target metrics of each algorithm on the training and validation splits.
for alg, models in models_dict.items():
    wrapped = MultiOutputWrapper(models, targets)
    for split_label, report_name, X_part, y_part in (
        ('train', 'Training', X_train, y_train),
        ('valid', 'Validation', X_valid, y_valid),
    ):
        print(f"\nEvaluating {alg} on {split_label} set")
        evaluate_model(wrapped, X_part, y_part, report_name)

In [None]:
# Validation-set class predictions of each algorithm, one column per target.
# MultiOutputWrapper already handles the DMatrix conversion for XGBoost boosters.
xgb_preds_valid, lgbm_preds_valid, catboost_preds_valid = (
    MultiOutputWrapper(models_dict[alg], targets).predict(X_valid)
    for alg in ('XGB', 'LGBM', 'CatBoost')
)

In [None]:
# Number of random weight vectors tried per target day.
num_combinations = 10000

# Seeded generator: without it the candidate weights, and therefore the selected
# ensemble and every downstream metric, change on each run.
weight_rng = np.random.default_rng(SEED)
weights_combinations = weight_rng.random((num_combinations, 3))
# Normalise each row so the three model weights sum to 1.
weights_combinations /= np.sum(weights_combinations, axis=1, keepdims=True)

# Cut points that turn a blended score back into a category index (see
# round_with_thresholds); the same set is stored for every target day.
initial = np.array([1, 1.5, 2.25, 3.25, 4.5])
thresholds_per_day = np.tile(initial, (len(targets), 1))
best_weights_list = []

def round_with_thresholds(aqi_vals, th):
    """Bin continuous scores into integer categories.

    `th` holds the cut points (in any order; they are sorted first). A score
    below the smallest cut point maps to 0, and each cut point reached or
    passed raises the category by one.
    """
    ordered_cuts = np.sort(th)
    return np.digitize(aqi_vals, ordered_cuts)

# Grid of confusion-matrix panels: one per target day, three per row.
n_days = len(targets)
cols = 3
rows = int(np.ceil(n_days / cols))
fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 5*rows))
axes = axes.flatten()

# For each target day, pick the weight vector whose blended prediction has the
# highest validation accuracy, then plot that blend's confusion matrix.
for day in range(n_days):
    best_accuracy = -np.inf

    for w in weights_combinations:
        # Weighted average of the three models' predicted class labels.
        # NOTE(review): because the blend is a float sum, a unanimous vote for
        # class k may land fractionally below k; with a cut point at exactly 1
        # that could flip class 1 to class 0 — worth confirming.
        raw_pred = (
            w[0] * xgb_preds_valid[:, day] +
            w[1] * lgbm_preds_valid[:, day] +
            w[2] * catboost_preds_valid[:, day]
        )
        temp_preds = round_with_thresholds(raw_pred, initial)
        acc = accuracy_score(y_valid.values[:, day], temp_preds)
        # Strict '>' keeps the first candidate that reaches the best accuracy.
        if acc > best_accuracy:
            best_accuracy = acc
            best_w = w.copy()
            best_raw_pred = raw_pred.copy()

    best_weights_list.append(best_w)
    # Thresholds are not tuned per day; the fixed set is stored again for this day.
    thresholds_per_day[day] = initial 

    print(
        f"Day {day} ({targets[day]}): "
        f"Best weights = {[round(v,5) for v in best_w]}, "
        f"accuracy = {best_accuracy:.4f}"
    )

    # Confusion matrix of the winning blend, limited to classes that actually occur.
    y_true_cat = y_valid.values[:, day]
    y_pred_cat = round_with_thresholds(best_raw_pred, initial)
    classes = np.unique(np.concatenate([y_true_cat, y_pred_cat]))
    cm = confusion_matrix(y_true_cat, y_pred_cat, labels=classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    ax = axes[day]
    disp.plot(ax=ax, cmap='Blues', colorbar=True, xticks_rotation='vertical')
    ax.set_title(f'Confusion Matrix\nDay {day}')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

# Remove the unused panels left over in the last grid row.
for ax in axes[n_days:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Test-set class predictions of each algorithm, one column per target.
# MultiOutputWrapper already handles the DMatrix conversion for XGBoost boosters.
xgb_preds_test, lgbm_preds_test, catboost_preds_test = (
    MultiOutputWrapper(models_dict[alg], targets).predict(X_test)
    for alg in ('XGB', 'LGBM', 'CatBoost')
)

In [None]:
# Evaluate the weighted ensemble on the held-out test period, one target day at a
# time, and save the grid of confusion matrices next to the models.
y_test_np = y_test.values.copy()
test_accuracy_per_day = []
test_precision_per_day = []
test_recall_per_day = []

# Three panels per row; the row count is rounded up.
ncols = 3
nrows = (len(targets) + ncols - 1) // ncols
fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 6))
axes = axes.flatten()

for day in range(len(targets)):
    # Blend the three models with the weights chosen on the validation split.
    w_xgb, w_lgbm, w_cat = best_weights_list[day]
    final_preds_test = (
        w_xgb  * xgb_preds_test[:, day] +
        w_lgbm * lgbm_preds_test[:, day] +
        w_cat  * catboost_preds_test[:, day]
    )
    rounded_preds_test = round_with_thresholds(final_preds_test, thresholds_per_day[day])

    acc = accuracy_score(y_test_np[:, day], rounded_preds_test)
    prec = precision_score(y_test_np[:, day], rounded_preds_test, average='weighted', zero_division=0)
    rec = recall_score(y_test_np[:, day], rounded_preds_test, average='weighted', zero_division=0)
    print(f"Day {day}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

    test_accuracy_per_day.append(acc)
    test_precision_per_day.append(prec)
    test_recall_per_day.append(rec)

    # Only classes that occur in the labels or the predictions get a row/column.
    classes = np.unique(np.concatenate([y_test_np[:, day], rounded_preds_test]))
    cm = confusion_matrix(y_test_np[:, day], rounded_preds_test, labels=classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)

    ax = axes[day]
    disp.plot(ax=ax, cmap='Blues', colorbar=True, xticks_rotation='vertical')

    ax.set_title(f'Confusion Matrix\nDay {day}')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

# Blank out the unused panels in the last grid row.
for i in range(len(targets), len(axes)):
    axes[i].axis('off')

plt.tight_layout()
# The folder is otherwise only created by the (commented-out) model-saving cell,
# so create it here; without this, savefig fails on a fresh checkout.
os.makedirs(models_folder, exist_ok=True)
plt.savefig(os.path.join(models_folder, 'confusion_matrices.png'), dpi=300)
plt.show()

# ⚠️ THE CODE BELOW SAVES THE MODELS. UNCOMMENTING IT CAN OVERWRITE PREVIOUSLY SAVED MODELS.

In [None]:
# os.makedirs(models_folder, exist_ok=True)
# np.save(os.path.join(models_folder, 'best_weights.npy'), best_weights_list)
# np.save(os.path.join(models_folder, 'thresholds_per_day.npy'), thresholds_per_day)

# for model_name in ['XGB', 'LGBM', 'CatBoost']:
#     for target in targets:
#         model = models_dict[model_name][target]
#         model_filename = os.path.join(models_folder, f'{model_name}_{target}_model.pkl')
#         joblib.dump(model, model_filename)
#         print(f"Saved {model_filename}")