In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score

In [5]:
# Load the raw weather observations from the CSV next to the notebook.
data = pd.read_csv('./weather.csv')

# Drop every row that contains at least one missing value.
data = data.dropna()

# One-hot encode the non-numeric columns; drop_first=True drops the first
# level of each encoded column so the indicators are not redundant.
# NOTE(review): if "WindDir" is stored as text, this replaces it with
# "WindDir_*" indicator columns and a later data["WindDir"] lookup would
# raise KeyError — confirm the column is numeric in weather.csv.
data = pd.get_dummies(data, drop_first=True)

# Preview only the first few rows; a 200-row dump floods the cell output.
print(data.head())

Saving weather.csv to weather.csv


ValueError: Invalid file path or buffer object type: <class 'dict'>

In [None]:
### preprocess data
# Predictor columns fed to the classifier, and the binary target:
# 1 when the recorded precipitation is greater than zero, 0 otherwise.
feature_columns = ["Temperature", "Humidity", "WindSpeed", "WindDir", "Pressure"]
X = data[feature_columns]
y = (data["Precipitation"] > 0).astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Unfitted scaler instance; it is fitted on the training split afterwards.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the training split only, then apply those same
# statistics to the test split so no test information leaks into training.
# NOTE: this cell overwrites X_train / X_test. Re-running it without first
# re-running the train/test split refits the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Summarise the split instead of dumping whole arrays into the output.
print('X_train:', X_train.shape, 'X_test:', X_test.shape)
print('y_train:', y_train.shape, 'y_test:', y_test.shape)

In [None]:
# Train a logistic-regression classifier with scikit-learn's default
# settings; fit() hands back the fitted estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Echo the estimator so the cell still displays it as its output.
model

In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Compute every evaluation metric up front ...
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# ... then report them one after another in a fixed order.
for heading, metric in (
    ('Accuracy:', accuracy),
    ('Confusion Matrix:\n', conf_matrix),
    ('Classification Report:\n', class_report),
):
    print(heading, metric)

In [None]:
# --- Confusion matrix heatmap ---
# Recomputed here so this cell only needs y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['No Rain', 'Rain']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # cells hold integer counts
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# --- ROC curve ---
# Second column of predict_proba: probability of the positive class
# (label 1), assuming both classes were present in the training labels.
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
# Format with ".2f": the previous " .2f" spec reserved a sign space and
# rendered the legend as "AUC =  0.xx" (double space).
ax.plot(fpr, tpr, color='blue', label=f"ROC curve (AUC = {roc_auc:.2f})")
# Diagonal reference line: TPR equal to FPR.
ax.plot([0, 1], [0, 1], color='red', linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC curve")
ax.legend()
plt.show()

In [None]:
import joblib

In [None]:
# Persist the classifier together with its scaler in a single file so the
# two can be restored as a matching pair.
artifacts = (model, scaler)
joblib.dump(artifacts, 'rain_prediction.pkl')
print('Model saved successfully')

In [None]:
# Restore the classifier and the scaler that were saved together.
# NOTE: joblib files are pickles — only load files from a trusted source.
loaded_model, scaler = joblib.load("rain_prediction.pkl")
print("Model loaded successfully")

# Example test data: reuse rows of the dataset, selecting exactly the columns
# (in the same order) that the model was trained on. "CloudCover" was not a
# training feature, so including it would make the feature count mismatch.
new_data = data[["Temperature", "Humidity", "WindSpeed", "WindDir", "Pressure"]]

# Standardize the new data using the same scaler used for training.
# transform() applies the stored statistics; creating a new StandardScaler
# and refitting it here would scale the inputs differently from training.
new_data_standardized = scaler.transform(new_data)

# Make predictions with the restored model, so the saved artifact is what
# actually gets exercised.
predictions = loaded_model.predict(new_data_standardized)
# Second column of predict_proba: probability of the positive class (label 1).
probabilities = loaded_model.predict_proba(new_data_standardized)[:, 1]

print('Predictions:', predictions)
print('Confidence:', probabilities)
