In [2]:

import pandas as pd
import numpy as np

data = pd.read_csv("Rainfall.csv")
data.head()


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7


In [3]:

# Convert rainfall column to binary
data['rainfall'] = data['rainfall'].astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})

# Drop 'day' column if present
data = data.drop(columns=["day"], errors='ignore')
data.head()


Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1025.9,19.9,18.3,16.8,13.1,72,49,1,9.3,80.0,26.3
1,1022.0,21.7,18.9,17.2,15.6,81,83,1,0.6,50.0,15.3
2,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
3,1018.9,22.3,20.6,19.1,18.8,90,88,1,1.0,50.0,16.9
4,1015.9,21.3,20.7,20.2,19.9,95,81,1,0.0,40.0,13.7


In [4]:

from sklearn.impute import SimpleImputer

# Define features and target
X = data.drop(columns=['rainfall'])
y = data['rainfall']

# Handle missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)


In [5]:

from sklearn.model_selection import train_test_split

# Split into train and test (stratified to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [6]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n🔍 {name}")
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print("✅ Mean CV Accuracy:", round(scores.mean(), 4))

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("🎯 Test Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("📄 Classification Report:\n", classification_report(y_test, y_pred))



🔍 Random Forest
✅ Mean CV Accuracy: 0.8015
🎯 Test Accuracy: 0.8108
📊 Confusion Matrix:
 [[14 10]
 [ 4 46]]
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.58      0.67        24
           1       0.82      0.92      0.87        50

    accuracy                           0.81        74
   macro avg       0.80      0.75      0.77        74
weighted avg       0.81      0.81      0.80        74


🔍 Gradient Boosting
✅ Mean CV Accuracy: 0.7947
🎯 Test Accuracy: 0.7838
📊 Confusion Matrix:
 [[14 10]
 [ 6 44]]
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64        24
           1       0.81      0.88      0.85        50

    accuracy                           0.78        74
   macro avg       0.76      0.73      0.74        74
weighted avg       0.78      0.78      0.78        74


🔍 Logistic Regression
✅ Mean CV Accuracy: 0.7946
🎯 Test Accuracy: 0.8378


In [7]:

from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print("🚀 XGBoost Test Accuracy:", accuracy_score(y_test, y_pred))
print("📄 Classification Report:\n", classification_report(y_test, y_pred))


ModuleNotFoundError: No module named 'xgboost'