In [3]:
# 📘 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [None]:
# 📂 2. Load Dataset from URL
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/_0eYOqji3unP1tDNKWZMjg/weatherAUS-2.csv
df = pd.read_csv(url)

df.head()


NameError: name 'weatherAUS' is not defined

In [None]:
# Show data info and summary
df.info()
df.describe()


In [None]:
# Missing values count
df.isnull().sum()


In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
plt.title("Missing Values Heatmap")
plt.show()


In [None]:
df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis=1, inplace=True)


In [None]:
df.dropna(subset=['RainTomorrow'], inplace=True)
df.fillna(df.mean(numeric_only=True), inplace=True)
df['RainToday'] = df['RainToday'].fillna('No')


In [None]:
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})


In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df['Humidity3pm'], bins=30, kde=True)
plt.title("Humidity at 3pm Distribution")
plt.show()


In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=False, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:
features = ['Humidity3pm', 'Pressure9am', 'RainToday', 'WindSpeed3pm']
X = df[features]
y = df['RainTomorrow']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Logistic Regression
lr_model = LogisticRegression(max_iter=500)
lr_model.fit(X_train, y_train)


In [None]:
# Random Forest predictions
y_pred_rf = rf_model.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Logistic Regression predictions
y_pred_lr = lr_model.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Accuracy
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# Classification report for Random Forest
print("\nRandom Forest Report:\n", classification_report(y_test, y_pred_rf))

# True Positive Rate for RF
TP = cm_rf[1][1]
FN = cm_rf[1][0]
TPR = (TP / (TP + FN)) * 100
print("True Positive Rate (RF):", round(TPR, 2), "%")

# Feature importance plot
importances = rf_model.feature_importances_
sns.barplot(x=importances, y=features)
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()
