In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import joblib
import numpy as np

In [2]:
url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Rainfall%20Forecast/Rainfall.csv?raw=true"
data = pd.read_csv(url)

In [3]:
# Clean column names by stripping leading/trailing whitespace
data.columns = data.columns.str.strip()


In [4]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
data = data.ffill().bfill()

In [6]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [26]:
# Task 1: Predict whether it will rain tomorrow (classification)
X_classification = data.drop(['RainTomorrow'], axis=1)
y_classification = data['RainTomorrow']

In [27]:
# Split the dataset into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)

In [9]:
# Standardize the feature variables
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_cls)
X_test_cls = scaler_cls.transform(X_test_cls)


In [10]:
# Train the Logistic Regression model for classification
model_cls = LogisticRegression()
model_cls.fit(X_train_cls, y_train_cls)



In [11]:
# Make predictions on the testing set for classification
y_pred_cls = model_cls.predict(X_test_cls)

In [12]:
# Evaluate the classification model
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
report_cls = classification_report(y_test_cls, y_pred_cls)

In [13]:
print("Classification Model Accuracy: {:.2f}%".format(accuracy_cls * 100))
print("Classification Report:")
print(report_cls)

Classification Model Accuracy: 85.40%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1304
           1       0.75      0.54      0.62       381

    accuracy                           0.85      1685
   macro avg       0.81      0.74      0.77      1685
weighted avg       0.85      0.85      0.84      1685



In [20]:
# Task 2: Predict the amount of rainfall (regression)
# Prepare data for regression (exclude rows with zero RainTomorrow)
X_regression = data.drop(['RainTomorrow'], axis=1)
y_regression = data['RainTomorrow'].astype(float)


In [21]:
# Split the dataset into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)

In [22]:
# Standardize the feature variables
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

In [23]:
# Train the Linear Regression model for regression
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)

# Make predictions on the testing set for regression
y_pred_reg = model_reg.predict(X_test_reg)

In [24]:
# Evaluate the regression model
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = mse_reg ** 0.5

print("Regression Model RMSE: {:.2f}".format(rmse_reg))

Regression Model RMSE: 0.35


In [25]:
# Save the trained models and preprocessing objects
joblib.dump(model_cls, 'logistic_regression_rainfall_model.pkl')
joblib.dump(scaler_cls, 'scaler_cls.pkl')
joblib.dump(model_reg, 'linear_regression_rainfall_model.pkl')
joblib.dump(scaler_reg, 'scaler_reg.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')

print("Models and preprocessing objects saved.")

Models and preprocessing objects saved.
