## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import joblib

## Dataset

In [2]:
# Load the weather table from disk; the "time" column becomes the row index.
weather = pd.read_csv(
    "weather_data_35_years.csv",
    index_col="time",
)

In [3]:
# Parse the "time" index labels into timestamps so the date-string slices used
# for the train/test split operate on a DatetimeIndex.
weather.index = pd.to_datetime(weather.index)
# The shift()-based lag/lead columns and the date slicing both assume that rows
# are in chronological order; enforce it instead of trusting the CSV row order.
weather = weather.sort_index()

In [4]:
# Adding lag features
#   target    -> weather code of the NEXT row (the value the models predict)
#   prev_code -> weather code of the PREVIOUS row (lag feature)
wmo_code = weather['weather_code (wmo code)']
weather = weather.assign(
    target=wmo_code.shift(-1),
    prev_code=wmo_code.shift(1),
)
# shift() leaves NaN at the edges (last target, first prev_code), and a missing
# code in the middle of the series would put NaN into its neighbours as well.
# Drop by value for both columns (instead of dropping the last row by position)
# so every remaining row has both filled and the integer casts cannot fail.
weather = weather.dropna(subset=['target', 'prev_code'])
weather['prev_code'] = weather['prev_code'].astype(int)

In [5]:
# Candidate feature columns fed to both models. The order is kept fixed so the
# feature matrix columns are always laid out the same way.
predictors = [
    "temperature_2m_max (°C)",
    "temperature_2m_min (°C)",
    "wind_speed_10m_max (km/h)",
    "cloud_cover_mean (%)",
    "relative_humidity_2m_mean (%)",
    "pressure_msl_mean (hPa)",
    "cloud_cover_max (%)",
    "dew_point_2m_mean (°C)",
    "temperature_2m_mean (°C)",
    "apparent_temperature_mean (°C)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "weather_code (wmo code)",  # current row's code
    "prev_code",                # previous row's code (lag feature)
]

In [6]:
# Keep only the predictors that actually exist in the loaded frame, and report
# any that were skipped so a renamed or missing CSV column does not silently
# shrink the feature set.
missing_predictors = [c for c in predictors if c not in weather.columns]
if missing_predictors:
    print("Predictors not found in weather and skipped:", missing_predictors)
predictors = [c for c in predictors if c in weather.columns]

## Split

In [7]:
# Chronological hold-out: rows up to and including 2017-05-31 are used for
# fitting, rows from 2017-06-01 onward are held back for evaluation.
train, test = weather.loc[:"2017-05-31"].copy(), weather.loc["2017-06-01":].copy()

# Feature matrix and integer-cast target for each partition.
X_train, y_train = train[predictors].copy(), train['target'].astype(int).copy()
X_test, y_test = test[predictors].copy(), test['target'].astype(int).copy()

## Model_1 Binary Predictor

In [9]:
# Stage 1 labels: 1 where the target code equals 3, otherwise 0.
y_train_s1 = y_train.eq(3).astype(int)
y_test_s1 = y_test.eq(3).astype(int)

# Per-row weights from the 'balanced' scheme, passed to fit() below.
sw_s1 = compute_sample_weight(class_weight='balanced', y=y_train_s1)

# Hyper-parameters collected in one place, then unpacked into the classifier.
stage1_params = dict(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    tree_method='hist',
    eval_metric='logloss',
    random_state=42,
)
model_stage1 = XGBClassifier(**stage1_params)
model_stage1.fit(X_train, y_train_s1, sample_weight=sw_s1)

# Evaluate on the held-out period.
y_pred_s1 = model_stage1.predict(X_test)
print("Stage 1 (is code 3?) Classification Report:")
print(classification_report(y_test_s1, y_pred_s1, digits=3))
print("Stage1 accuracy:", accuracy_score(y_test_s1, y_pred_s1))

Stage 1 (is code 3?) Classification Report:
              precision    recall  f1-score   support

           0      0.864     0.743     0.799      2135
           1      0.509     0.695     0.588       817

    accuracy                          0.730      2952
   macro avg      0.687     0.719     0.693      2952
weighted avg      0.766     0.730     0.741      2952

Stage1 accuracy: 0.7300135501355014


## Model_2 Other Class Predictor

In [11]:
# Stage 2: restricted to rows whose target is NOT code 3, predict which of the
# remaining codes it is. The test rows are selected with the true labels, so
# the report below measures Stage 2 on its own rather than chained after the
# Stage 1 predictions.
mask_train_s2 = (y_train != 3)
mask_test_s2  = (y_test != 3)

X_train_s2 = X_train[mask_train_s2].copy()
y_train_s2 = y_train[mask_train_s2].copy()

X_test_s2 = X_test[mask_test_s2].copy()
y_test_s2 = y_test[mask_test_s2].copy()

assert len(y_train_s2) > 0, "No training rows for Stage 2."

# Encode the raw codes as consecutive integers 0..K-1 for the classifier.
le_s2 = LabelEncoder()
y_train_s2_enc = le_s2.fit_transform(y_train_s2)

# LabelEncoder.transform raises ValueError on labels it was not fitted on. A
# code that only occurs in the test period cannot be predicted by this model
# anyway, so report such rows and leave them out instead of crashing.
seen_in_train_s2 = y_test_s2.isin(le_s2.classes_)
if not seen_in_train_s2.all():
    unseen_codes_s2 = sorted(y_test_s2[~seen_in_train_s2].unique())
    print(
        f"Stage 2: dropping {int((~seen_in_train_s2).sum())} test rows "
        f"with codes unseen in training: {unseen_codes_s2}"
    )
    X_test_s2 = X_test_s2[seen_in_train_s2].copy()
    y_test_s2 = y_test_s2[seen_in_train_s2].copy()
y_test_s2_enc  = le_s2.transform(y_test_s2)

# Per-row weights from the 'balanced' scheme, passed to fit() below.
sw_s2 = compute_sample_weight(class_weight='balanced', y=y_train_s2_enc)

model_stage2 = XGBClassifier(
    eval_metric='mlogloss',
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    tree_method='hist'
)
model_stage2.fit(X_train_s2, y_train_s2_enc, sample_weight=sw_s2)
y_pred_s2_enc = model_stage2.predict(X_test_s2)
# Map the predicted class ids back to the original codes for reporting.
y_pred_s2 = le_s2.inverse_transform(y_pred_s2_enc)
print("\nStage 2 (which non-3 class?) Classification Report:")
print(classification_report(y_test_s2, y_pred_s2, digits=3))


Stage 2 (which non-3 class?) Classification Report:
              precision    recall  f1-score   support

           0      0.125     0.083     0.100        12
           1      0.160     0.342     0.218        76
           2      0.253     0.548     0.346       126
          51      0.570     0.303     0.396       778
          53      0.170     0.155     0.162       291
          55      0.057     0.101     0.073       109
          61      0.189     0.280     0.226       257
          63      0.430     0.437     0.433       403
          65      0.276     0.096     0.143        83

    accuracy                          0.302      2135
   macro avg      0.248     0.261     0.233      2135
weighted avg      0.370     0.302     0.313      2135



## Improve Stage 2 accuracy