In [2]:
import pandas as pd

# Load the dataset from your data folder
df = pd.read_csv("data/weatherAUS.csv")

# Check basic shape
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (145460, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


#### Step 1: Check for Missing Values

Understanding which columns have missing data helps determine what should be cleaned or dropped.



In [7]:
# Check how many missing values per column
missing = df.isnull().sum().sort_values(ascending=False)
missing[missing > 0]


Pressure9am      15065
Pressure3pm      15028
WindDir9am       10566
WindGustDir      10326
WindGustSpeed    10263
Humidity3pm       4507
WindDir3pm        4228
Temp3pm           3609
RainTomorrow      3267
Rainfall          3261
RainToday         3261
WindSpeed3pm      3062
Humidity9am       2654
Temp9am           1767
WindSpeed9am      1767
MinTemp           1485
MaxTemp           1261
dtype: int64

#### Step 2: Drop Columns with Excessive Missing Data

The following features have more than 40% missing data and would reduce the dataset size significantly if we tried to impute them. Therefore, we remove them:
- Sunshine (69,835 missing)
- Evaporation (62,790)
- Cloud3pm (59,358)
- Cloud9am (55,888)

These features are dropped to maintain data quality and avoid introducing noise.


In [8]:
# List of high-missing columns we want to drop
to_drop = ['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am']

# Drop only the ones that exist
df.drop(columns=[col for col in to_drop if col in df.columns], inplace=True)


#### Step 3: Drop Rows with Missing Target

Rows missing the target variable (`RainTomorrow`) cannot be used for training or evaluation and are removed.



In [9]:
df.dropna(subset=['RainTomorrow'], inplace=True)


#### Step 4: Fill Remaining Missing Values

Numerical columns are filled with their mean, and categorical columns with their most frequent value (mode).


In [10]:
# Fill numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Fill categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


#### Step 5: Encode Categorical Variables

The target and binary features are converted to 1s and 0s. All other categorical features are one-hot encoded.


In [11]:
df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})
df['RainToday'] = df['RainToday'].map({'Yes': 1, 'No': 0})

# One-hot encode remaining categorical variables
df = pd.get_dummies(df, drop_first=True)


#### Step 6: Split Features and Target

Now that the dataset is clean and encoded, we separate the features (X) from the target variable (y).


In [12]:
X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']


#### Step 7: Train/Test Split

Split the data into training and testing sets to evaluate performance.


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## 2. Modelling / Classification

In this section, we apply several supervised machine learning algorithms to classify whether it will rain tomorrow. The models are trained on the preprocessed data and evaluated using accuracy, confusion matrix, and classification reports. Three different classifiers are compared to assess performance.



### 2.1 Logistic Regression

Logistic Regression is a simple and interpretable linear model for binary classification. It models the probability of a binary response based on one or more predictor variables.


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

lr_model = LogisticRegression(max_iter=5000, solver='liblinear')
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.8455993530011604

Confusion Matrix:
 [[20962  1102]
 [ 3289  3086]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.91     22064
           1       0.74      0.48      0.58      6375

    accuracy                           0.85     28439
   macro avg       0.80      0.72      0.74     28439
weighted avg       0.84      0.85      0.83     28439



### 2.2 Random Forest

Random Forest is an ensemble method that builds multiple decision trees and merges them to get a more accurate and stable prediction. It works well on structured/tabular data.


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Reduce number of trees to speed things up
rf_model = RandomForestClassifier(n_estimators=30, max_depth=15, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.7900066809662787

Confusion Matrix:
 [[22030    34]
 [ 5938   437]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      1.00      0.88     22064
           1       0.93      0.07      0.13      6375

    accuracy                           0.79     28439
   macro avg       0.86      0.53      0.50     28439
weighted avg       0.82      0.79      0.71     28439



### 2.3 Support Vector Machine (SVM)

SVM aims to find a hyperplane that best separates the classes in the data. It is effective in high-dimensional spaces but can be slow on large datasets.


In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
