1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle


2. Load and Preprocess Data

In [2]:
# Read the CSV, strip surrounding whitespace from every column name,
# and discard the "day" column.
data = (
    pd.read_csv("Rainfall.csv")
    .rename(columns=str.strip)
    .drop(columns=["day"])
)

# Remaining cleaning steps, expressed as one non-mutating chain:
#   1. fill gaps in "winddirection" with its most frequent value and gaps in
#      "windspeed" with its median (both computed before any filling happens),
#   2. encode the "rainfall" target as 1 for "yes" and 0 for "no",
#   3. drop the three temperature columns (the name "temparature" is spelled
#      as written here; it must match the header in the CSV).
data = (
    data
    .fillna({
        "winddirection": data["winddirection"].mode()[0],
        "windspeed": data["windspeed"].median(),
    })
    .assign(rainfall=lambda frame: frame["rainfall"].map({"yes": 1, "no": 0}))
    .drop(columns=["maxtemp", "temparature", "mintemp"])
)

# Summarize the cleaned frame (columns, non-null counts, dtypes)
print("Data Info:")
data.info()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pressure       366 non-null    float64
 1   dewpoint       366 non-null    float64
 2   humidity       366 non-null    int64  
 3   cloud          366 non-null    int64  
 4   rainfall       366 non-null    int64  
 5   sunshine       366 non-null    float64
 6   winddirection  366 non-null    float64
 7   windspeed      366 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 23.0 KB


3. Balance Classes

In [3]:
# Split the rows by target label.
# NOTE: label 1 is treated as the majority class and label 0 as the minority
# class; this is hard-coded here rather than derived from the class counts.
df_majority = data.loc[data["rainfall"].eq(1)]
df_minority = data.loc[data["rainfall"].eq(0)]

print(f"Majority class count: {df_majority.shape[0]}")
print(f"Minority class count: {df_minority.shape[0]}")

# Draw, without replacement, as many majority rows as there are minority rows
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],
    replace=False,
    random_state=42,
)

# Stack both classes, then shuffle every row with a fixed seed
df_balanced = pd.concat(
    [df_majority_downsampled, df_minority]
).sample(frac=1, random_state=42)

print("Balanced class distribution:")
print(df_balanced["rainfall"].value_counts())


Majority class count: 249
Minority class count: 117
Balanced class distribution:
rainfall
1    117
0    117
Name: count, dtype: int64


4. Feature Selection and Split

In [4]:
# Target is the "rainfall" column; every other column is used as a feature
y = df_balanced["rainfall"]
X = df_balanced.drop(columns=["rainfall"])

# Hold out 20% of the rows for testing, with a fixed seed.
# No `stratify` argument is passed, so the class mix of each split is
# whatever the seeded shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


5. Model Training and Hyperparameter Tuning

In [5]:
# Hyperparameter values to search: 3 * 2 * 4 * 3 * 3 = 216 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=["sqrt", "log2"],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base Random Forest with a fixed seed
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, using all available cores
# (n_jobs=-1) and progress messages (verbose=1). No `scoring` argument is
# given, so the estimator's default score is used.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only
grid.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameters
best_rf = grid.best_estimator_
print("Best Parameters:", grid.best_params_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


6. Model Evaluation

In [6]:
# Score the tuned estimator with 5-fold cross-validation on the training split
cv_scores = cross_val_score(estimator=best_rf, X=X_train, y=y_train, cv=5)
print(f"Cross-validation Mean Score: {cv_scores.mean():.4f}")

# Predict labels for the held-out test split
y_pred = best_rf.predict(X_test)

# Report accuracy, the confusion matrix, and per-class precision/recall/F1
# for the test split
print(
    "Test Set Accuracy:",
    accuracy_score(y_test, y_pred),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
)


Cross-validation Mean Score: 0.8189
Test Set Accuracy: 0.7446808510638298
Confusion Matrix:
 [[17  7]
 [ 5 18]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.71      0.74        24
           1       0.72      0.78      0.75        23

    accuracy                           0.74        47
   macro avg       0.75      0.75      0.74        47
weighted avg       0.75      0.74      0.74        47



7. Model Saving

In [7]:
# Bundle the tuned estimator with the ordered list of feature column names,
# so whoever loads the file knows which columns (and in what order) to supply.
model_data = dict(model=best_rf, features=list(X.columns))

# Serialize the bundle to disk
with open("rainfall_prediction_model.pkl", mode="wb") as model_file:
    pickle.dump(model_data, model_file)


8. Model Loading and Prediction

In [8]:
# Load the trained model and feature names.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a pickle that you created yourself or that comes from a trusted
# source.
with open("rainfall_prediction_model.pkl", "rb") as f:
    model_data = pickle.load(f)
model, feature_names = model_data["model"], model_data["features"]

# Describe the sample by feature name instead of by position, so a change in
# the saved column order cannot silently pair a value with the wrong feature.
sample_features = {
    "pressure": 1015.9,
    "dewpoint": 19.9,
    "humidity": 95,
    "cloud": 81,
    "sunshine": 0.0,
    "winddirection": 40.0,
    "windspeed": 13.7,
}

# Fail loudly if the sample and the saved feature list disagree
missing_features = [name for name in feature_names if name not in sample_features]
unexpected_features = [name for name in sample_features if name not in feature_names]
if missing_features or unexpected_features:
    raise ValueError(
        "Sample does not match the model's features: "
        f"missing={missing_features}, unexpected={unexpected_features}"
    )

# Prepare input data for prediction, with columns in the saved feature order
input_data = pd.DataFrame([sample_features])[feature_names]

# Make prediction (label 1 is reported as rainfall, anything else as none)
prediction = model.predict(input_data)[0]
print("Prediction:", "Rainfall" if prediction == 1 else "No Rainfall")


Prediction: Rainfall
