In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
rainfall = pd.read_csv('/home/charles/Desktop/UG/Rainfall.csv')
rainfall['is_rainy_season'].value_counts()
rainfall['version'].value_counts()

version
final       24165
prelim         60
forecast       15
Name: count, dtype: int64

In [5]:
rainfall.rename(columns={'Unnamed: 0':'ID'})

Unnamed: 0,ID,date,adm_level,adm_id,PCODE,Name,Municipality,n_pixels,rfh,rfh_avg,...,dekad_of_month,is_rainy_season,sin_month,cos_month,drought_flag,high_rainfall_flag,r3h_6m_ma,r3h_12m_ma,rfh_3d_std,rfh_rate_change
0,203616,1981-01-01,2,1009451,GH0705,Ada East,Greater Accra,7.0,2.857143,3.504762,...,1,0,0.500000,0.866025,0,0,208.147033,208.147033,0.000000,2.857143
1,203617,1981-01-11,2,1009451,GH0705,Ada East,Greater Accra,7.0,2.285714,2.633333,...,2,0,0.500000,0.866025,0,0,208.147033,208.147033,0.404061,-0.571429
2,203618,1981-01-21,2,1009451,GH0705,Ada East,Greater Accra,7.0,6.000000,5.171429,...,3,0,0.500000,0.866025,0,1,208.147033,208.147033,2.000000,3.714286
3,203619,1981-02-01,2,1009451,GH0705,Ada East,Greater Accra,7.0,2.571429,3.576191,...,1,0,0.866025,0.500000,0,0,208.147033,208.147033,2.066908,-3.428571
4,203620,1981-02-11,2,1009451,GH0705,Ada East,Greater Accra,7.0,7.285714,7.752381,...,2,0,0.866025,0.500000,0,0,208.147033,208.147033,2.436960,4.714285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24235,357131,2025-10-01,2,1009545,GH0728,Tema West Municipal,Greater Accra,4.0,57.500000,28.366667,...,1,1,-0.866025,0.500000,0,1,282.500000,204.166667,12.273786,19.750000
24236,357132,2025-10-11,2,1009545,GH0728,Tema West Municipal,Greater Accra,4.0,61.000000,33.058334,...,2,1,-0.866025,0.500000,0,1,286.861111,206.743056,12.535782,3.500000
24237,357133,2025-10-21,2,1009545,GH0728,Tema West Municipal,Greater Accra,4.0,23.000000,30.616667,...,3,1,-0.866025,0.500000,0,0,288.708333,209.013889,21.001984,-38.000000
24238,357134,2025-11-01,2,1009545,GH0728,Tema West Municipal,Greater Accra,4.0,26.250000,12.725000,...,1,0,-0.500000,0.866025,0,1,289.888889,211.201389,21.063891,3.250000


In [7]:
rainfall['high_rainfall_flag'].value_counts()

high_rainfall_flag
0    14377
1     9863
Name: count, dtype: int64

In [8]:
drop_cols = ["date", "adm_level", "adm_id", "PCODE", "Name", "Municipality", "version"]
rainfall = rainfall.drop(columns=drop_cols)

In [9]:
X = rainfall.drop(columns=["high_rainfall_flag"])
y = rainfall["high_rainfall_flag"]


In [None]:
# Split data into two halves (first half for training/testing, second half holdout)

X_first_half, X_second_half, y_first_half, y_second_half = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_first_half, y_first_half, test_size=0.3, random_state=42, stratify=y_first_half
)


In [12]:
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [13]:
print("After SMOTE - training set class distribution:")
print(pd.Series(y_train_smote).value_counts(normalize=True).round(4))
print(f"New training samples: {len(y_train_smote)}\n")

After SMOTE - training set class distribution:
high_rainfall_flag
1    0.5
0    0.5
Name: proportion, dtype: float64
New training samples: 10064



In [None]:
# Training XGBoost on the SMOTE-resampled training set
model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train_smote, y_train_smote)

In [None]:
# Evaluate on first half's test set (not SMOTE'd)
y_pred_test = model.predict(X_test)
print("Classification Report on First Half Test Set:")
print(classification_report(y_test, y_pred_test))


Classification Report on First Half Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2156
           1       1.00      1.00      1.00      1480

    accuracy                           1.00      3636
   macro avg       1.00      1.00      1.00      3636
weighted avg       1.00      1.00      1.00      3636



In [None]:
# Evaluate on untouched second half (holdout)
y_pred_holdout = model.predict(X_second_half)
print("Classification Report on Holdout Set:")
print(classification_report(y_second_half, y_pred_holdout))

Classification Report on Holdout Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7189
           1       1.00      1.00      1.00      4931

    accuracy                           1.00     12120
   macro avg       1.00      1.00      1.00     12120
weighted avg       1.00      1.00      1.00     12120



In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# 1️⃣ Metrics on First Half Test Set
y_pred_test = model.predict(X_test)
y_pred_test_proba = model.predict_proba(X_test)[:, 1]  # probability for positive class

print("=== Results on First Half Test Set ===")
print(classification_report(y_test, y_pred_test, digits=4))
print(f"ROC-AUC:  {roc_auc_score(y_test, y_pred_test_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\n" + "="*50 + "\n")

# 2️⃣ Metrics on Holdout Second Half
y_pred_holdout = model.predict(X_second_half)
y_pred_holdout_proba = model.predict_proba(X_second_half)[:, 1]

print("=== Results on Holdout Set ===")
print(classification_report(y_second_half, y_pred_holdout, digits=4))
print(f"ROC-AUC:  {roc_auc_score(y_second_half, y_pred_holdout_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_second_half, y_pred_holdout):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_second_half, y_pred_holdout))


=== Results on First Half Test Set ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      2156
           1     1.0000    1.0000    1.0000      1480

    accuracy                         1.0000      3636
   macro avg     1.0000    1.0000    1.0000      3636
weighted avg     1.0000    1.0000    1.0000      3636

ROC-AUC:  1.0000
Accuracy: 1.0000
Confusion Matrix:
[[2156    0]
 [   0 1480]]


=== Results on Holdout Set ===
              precision    recall  f1-score   support

           0     0.9999    0.9997    0.9998      7189
           1     0.9996    0.9998    0.9997      4931

    accuracy                         0.9998     12120
   macro avg     0.9997    0.9998    0.9997     12120
weighted avg     0.9998    0.9998    0.9998     12120

ROC-AUC:  1.0000
Accuracy: 0.9998
Confusion Matrix:
[[7187    2]
 [   1 4930]]
