In [84]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [85]:
# Path to the raw rainfall CSV. Set the RAINFALL_CSV environment variable to point the
# notebook at another copy of the file; the fallback is the original hard-coded location,
# so existing runs on the author's machine keep working.
RAINFALL_CSV = os.environ.get('RAINFALL_CSV', '/home/charles/Desktop/UG/Rainfall.csv')
rainfall = pd.read_csv(RAINFALL_CSV)

# Only a cell's last expression is rendered automatically, so the first summary is printed
# explicitly; as a bare expression its result would be computed and then thrown away.
print(rainfall['is_rainy_season'].value_counts())
rainfall['version'].value_counts()

version
final       24165
prelim         60
forecast       15
Name: count, dtype: int64

In [86]:
# Class balance of the target column before any splitting or resampling.
target_counts = rainfall['high_rainfall_flag'].value_counts()
target_counts

high_rainfall_flag
0    14377
1     9863
Name: count, dtype: int64

In [87]:
# Columns removed from the frame before the feature/target split.
drop_cols = ["date", "adm_level", "adm_id", "PCODE", "Name", "Municipality", "version"]

# `rainfall` is reassigned in place, so a second execution of this cell sees a frame that
# no longer has these columns. errors="ignore" makes the drop a no-op in that case instead
# of raising KeyError, which keeps the cell safe to re-run.
rainfall = rainfall.drop(columns=drop_cols, errors="ignore")

In [88]:
# Separate the binary target from the remaining columns, which all become features.
# NOTE(review): the recorded test metrics further down are all 1.0000. A perfect score
# usually means one of the remaining columns encodes the target directly (for example the
# quantity the flag was thresholded from) -- TODO audit the feature columns for leakage.
target_col = "high_rainfall_flag"
y = rainfall[target_col]
X = rainfall.drop(columns=[target_col])


In [89]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class proportions the
# same in both partitions; the fixed seed makes the partition reproducible.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [90]:
# SMOTE oversampler, configured here and applied to the training partition only.
#   sampling_strategy='auto' -> oversample the minority class until both classes are equal
#   k_neighbors=5            -> library default neighbourhood size for synthesising samples
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)

In [91]:
# Resample the training partition only; the test partition keeps its original distribution.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Share of each class after resampling, rounded to four decimals.
class_share = pd.Series(y_train_smote).value_counts(normalize=True).round(4)

print("After SMOTE - training set class distribution:")
print(class_share)
print(f"New training samples: {len(y_train_smote)}\n")

After SMOTE - training set class distribution:
high_rainfall_flag
1    0.5
0    0.5
Name: proportion, dtype: float64
New training samples: 23004



In [92]:
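# Class weighting is disabled here: SMOTE has already balanced the training classes, so a
# weight derived from the original y_train would over-correct towards the positive class.
# While these lines stay commented out, `scale_pos_weight` is undefined on a fresh kernel.
# neg_count = sum(y_train == 0)
# pos_count = sum(y_train == 1)
# scale_pos_weight = neg_count / pos_count if pos_count != 0 else 1  # Avoid division by zero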

In [93]:
# XGBoost binary classifier, fitted later on the SMOTE-balanced training set.
# scale_pos_weight is deliberately not passed: its only definition in this notebook is
# commented out, so the name is undefined after Restart & Run All (NameError), and the
# training classes are already 50/50 after SMOTE, so the library default weight of 1 is
# the appropriate setting.
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',           # or 'aucpr' is even better for imbalance
    random_state=42,
    n_estimators=100,
    max_depth=6,                     # tune these later
    learning_rate=0.1
)

In [94]:
# model = XGBClassifier(
#     n_estimators=300,
#     learning_rate=0.05,
#     max_depth=5,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     objective="binary:logistic",
#     eval_metric="logloss",
#     random_state=42
# )


In [95]:
# Fit the classifier on the resampled training data, with progress messages either side.
print("Training XGBoost...")
model.fit(X=X_train_smote, y=y_train_smote)
print("Training completed!\n")

Training XGBoost...
Training completed!



In [96]:
# Report how many boosting rounds the fitted booster actually holds. `best_iteration` is
# the 0-based index of the best round under early stopping, not a count of rounds, so it
# is not interchangeable with `n_estimators`; ask the booster for the count directly.
print("Training finished! Number of boosting rounds:", model.get_booster().num_boosted_rounds())

Training finished! Number of boosting rounds: 100


In [97]:
# Score the untouched test partition.
# Column 1 of predict_proba is the probability of the positive class (label 1).
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Hard class labels from the same model.
y_pred = model.predict(X_test)

In [98]:
# Evaluation on the held-out test partition (never resampled).
print("=== Final Results on Original Test Set ===")
report = classification_report(y_test, y_pred, digits=4)
print(report)

# ROC-AUC is computed from the positive-class probabilities, accuracy from the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC:       {roc_auc:.4f}")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy:      {accuracy:.4f}")

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

=== Final Results on Original Test Set ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      2875
           1     1.0000    1.0000    1.0000      1973

    accuracy                         1.0000      4848
   macro avg     1.0000    1.0000    1.0000      4848
weighted avg     1.0000    1.0000    1.0000      4848

ROC-AUC:       1.0000
Accuracy:      1.0000

Confusion Matrix:
[[2875    0]
 [   0 1973]]


In [99]:
# print("\nResults on test set:")
# print(classification_report(y_test, y_pred))
# print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

In [100]:
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
