## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline

In [None]:
import warnings

# Suppress every warning for the rest of the session to keep cell outputs clean.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

## Import dataset

In [None]:
# load the weather observations from a CSV file (path is relative to the working directory)
df = pd.read_csv('weatherAUS.csv')

## Exploratory data analysis


Now, we will explore the data to gain insights about it.

In [None]:
# view dimensions of dataset as a (rows, columns) tuple
df.shape

In [None]:
# preview the first rows of the dataset
df.head()

In [None]:
# view summary of dataset: column names, non-null counts and dtypes
df.info()

We segregate the dataset into numerical and categorical variables.

In [None]:
# inspect the dtype pandas assigned to one column
df['Cloud3pm'].dtype

In [None]:
# inspect the dtype pandas assigned to another column, for comparison
df['RainToday'].dtype

In [None]:
# find numerical variables: every column whose dtype is not object ('O')
numerical = [name for name, dtype in df.dtypes.items() if dtype != 'O']
print(f'There are {len(numerical)} numerical variables\n')
print('The numerical variables are :', numerical)

In [None]:
# count the missing entries in each numerical column
df[numerical].isna().sum()

We can see that all the original numerical variables contain missing values.

We use median imputation.

In [None]:
# impute missing values in every numerical column with that column's median
# NOTE(review): the medians are computed on the whole dataset, before the
# train/test split performed later in this notebook, so test rows influence them.
for col in numerical:
    col_median = df[col].median()
    # Assign the result back to the column. The chained form
    # `df[col].fillna(..., inplace=True)` operates on a temporary object under
    # pandas Copy-on-Write and leaves `df` unchanged.
    df[col] = df[col].fillna(col_median)
df.info()

In [None]:
# re-count missing entries per numerical column after imputation
df[numerical].isna().sum()

Now, we can see that there are no missing values in the numerical columns.

After the above, we explore the categorical variables.

In [None]:
# find categorical variables: every column stored with object ('O') dtype
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print(f'There are {len(categorical)} categorical variables\n')
print('The categorical variables are :', categorical)

The number of labels within a categorical variable is known as **cardinality**. A high number of labels within a variable is known as **high cardinality**. High cardinality may pose some serious problems in the machine learning model. So, we will check for high cardinality.

In [None]:
# report the number of distinct labels per categorical column
# (a missing value counts as one label, hence dropna=False)
for column in categorical:
    n_labels = df[column].nunique(dropna=False)
    print(column, ' contains ', n_labels, ' labels')

We can see that there is a `Date` variable which needs to be preprocessed.

All the other variables contain a relatively small number of labels.

In [None]:
# check how the Date column is currently stored
df['Date'].dtypes

We can see that the data type of `Date` variable is object. We will parse the date currently coded as object into datetime format.

In [None]:
# parse the dates, currently stored as strings (object dtype), into pandas datetime values
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# extract the calendar year from the parsed date into its own column, then preview it
df['Year'] = df['Date'].dt.year
df['Year'].head()

In [None]:
# extract the month number from the parsed date into its own column, then preview it
df['Month'] = df['Date'].dt.month
df['Month'].head()

In [None]:
# extract the day of the month from the parsed date into its own column, then preview it
df['Day'] = df['Date'].dt.day
df['Day'].head()

In [None]:
# again view the summary of dataset, now including the Year, Month and Day columns
df.info()

We can see that there are three additional columns created from `Date` variable. We will drop the original `Date` variable from the dataset.

In [None]:
# Year, Month and Day now carry the date information, so remove the raw Date column
df.drop(columns=['Date'], inplace=True)

Now, we can see that the `Date` variable has been removed from the dataset.


In [None]:
# rebuild the list of object-dtype columns and count the missing entries in each
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
df[categorical].isna().sum()

We can see that all the variables except the first one contain missing values.

In [None]:
# list the categorical columns that have gaps, with the share of rows affected
categoricalmissing = [name for name in categorical if df[name].isna().any()]
for name in categoricalmissing:
    share_missing = df[name].isna().mean() * 100
    print(f"{name}: {share_missing:.2f}% missing")

In [None]:
# impute missing values in the categorical predictors with each column's most
# frequent value (mode); the target `RainTomorrow` is deliberately left untouched
for col in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    # Assign the result back to the column. The chained form
    # `df[col].fillna(..., inplace=True)` operates on a temporary object under
    # pandas Copy-on-Write and leaves `df` unchanged.
    df[col] = df[col].fillna(df[col].mode()[0])

Since the number of missing values in `RainTomorrow` is relatively small compared to the size of the dataset, and this is the target column, we are not going
to take the risk of imputing it. Rather, we are going to drop all the rows which have missing values in this column.

In [None]:
# discard the rows whose target label is missing, then re-check the summary
df.dropna(axis=0, how='any', subset=['RainTomorrow'], inplace=True)
df.info()

## Encode categorical variables

In [None]:
# object-dtype columns that still need to be encoded
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print(categorical)

In [None]:
# Encode the two Yes/No columns explicitly so that 'Yes' is always 1 and 'No' is always 0
mapping = {'Yes': 1, 'No': 0}

for binary_col in ('RainToday', 'RainTomorrow'):
    df[binary_col] = df[binary_col].map(mapping)



In [None]:
# object-dtype columns remaining after the explicit Yes/No mapping
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print(categorical)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per remaining categorical column and keep it, so the integer
# codes can be traced back to their labels.
label_encoders = {var: LabelEncoder().fit(df[var]) for var in categorical}

# Replace each column's labels with the integer codes of its encoder
for var, le in label_encoders.items():
    df[var] = le.transform(df[var])

# Print the label -> code table of every encoder
for var, le in label_encoders.items():
    print(f"{var} Label Encoding:")
    codes = le.transform(le.classes_)
    for label, code in zip(le.classes_, codes):
        print(f"\t{label}: {code}")
    print()



In [None]:
# list the column names of the fully encoded dataset
df.columns

## Split data into separate training and test set

In [None]:
# target vector, and feature matrix made of every other column
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


## Feature selection

In [None]:
# ref: https://medium.com/@Kavya2099/optimizing-performance-selectkbest-for-efficient-feature-selection-in-machine-learning-3b635905ed48

#using mutual_info_classif for classification problem MI
from sklearn.feature_selection import mutual_info_classif

# Fix the seed: the estimator adds a small amount of random noise to continuous
# features, so unseeded runs can produce different scores and rankings.
mi_scores = mutual_info_classif(X_train, y_train, random_state=0)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)

mi_scores # every feature with its MI score, highest first

In [None]:
# for classification problem
from functools import partial
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2, f_classif

# Select the 5 best features by mutual information. The scorer is seeded so the
# selection is reproducible across runs.
X_opt=SelectKBest(score_func=partial(mutual_info_classif, random_state=0),k=5)
X_opt.fit(X_train, y_train)
names = X_opt.get_feature_names_out()

# Run the f_classif comparison on the full feature set, i.e. BEFORE X_train is
# narrowed below. Fitted afterwards it could only ever return the 5 columns
# already chosen by mutual information.
f_classif_names = SelectKBest(score_func=f_classif,k=5).fit(X_train, y_train).get_feature_names_out()

# Keep only the features selected by mutual information
X_train = X_train[names]
X_test = X_test[names]

print(f'mutual_info_classif: {names}')
print(f'f_classif:', f_classif_names)

In [None]:
# inspect the training features that remain after feature selection
X_train

In [None]:
# for regression problem
from functools import partial
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Score the full feature set. X_train has already been narrowed to the selected
# columns above, so picking k=5 from it would trivially return those same columns.
# y_train keeps the original row index, so it identifies the training rows in X.
X_train_full = X.loc[y_train.index]
print(f'f_regression:', SelectKBest(score_func=f_regression,k=5).fit(X_train_full, y_train).get_feature_names_out())
# the mutual-information scorer is seeded so the selection is reproducible
print(f'mutual_info_regression:', SelectKBest(score_func=partial(mutual_info_regression, random_state=0),k=5).fit(X_train_full, y_train).get_feature_names_out())

## Polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# learn the degree-2 expansion from the training features only, then apply it to both splits
poly = PolynomialFeatures(degree=2).fit(X_train)
X_train, X_test = poly.transform(X_train), poly.transform(X_test)

This code is **adding polynomial features** to your dataset so a model can capture **nonlinear relationships** without changing the underlying algorithm.

Let’s unpack it step by step:

---

### **1. `PolynomialFeatures(2)`**

* From `sklearn.preprocessing`, this generates new features that are **all polynomial combinations** of the existing features up to the specified **degree** (here, `2`).
* **Degree = 2** means:

  * Keep the original features (degree 1)
  * Add squared terms (degree 2)
  * Add interaction terms (feature₁ × feature₂)
* If your original data has:

  $$
  X = [x_1, x_2]
  $$

  then with degree 2, you’ll get:

  $$
  [1, x_1, x_2, x_1^2, x_1x_2, x_2^2]
  $$

  (The `1` is the bias term if `include_bias=True`, which is default.)

---

### **2. `poly.fit(X_train)`**

* Learns how many output features will be generated based on the number of input features and the degree.
* Doesn’t change the data yet — just prepares the transformer.

---

### **3. `poly.transform(X_train)` & `poly.transform(X_test)`**

* Applies the transformation:

  * Takes each row in `X_train` / `X_test`
  * Generates the polynomial and interaction terms.
* This expands your feature space.

---

✅ **Why do this?**

* Many models (like **Linear Regression**, **Logistic Regression**, or even `SVC(kernel='linear')`) are inherently linear.
* By **manually adding polynomial features**, you let these models fit **curved decision boundaries** or nonlinear trends — similar to what an `SVC(kernel='poly')` would do, but explicitly in the feature space.

## MinMax scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# learn per-feature min/max from the training data only, then rescale both splits with it
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Model training

In [None]:
# fit a logistic-regression classifier on the prepared training features
from sklearn.linear_model import LogisticRegression

# liblinear solver with a fixed seed for reproducible results
logreg = LogisticRegression(random_state=0, solver='liblinear')

# train on the training split; the fitted estimator is the cell's output
logreg.fit(X_train, y_train)



## Predict results

In [None]:
# predict the class label for every row of the held-out test set
y_pred_test = logreg.predict(X_test)

## Check accuracy score

In [None]:
from sklearn.metrics import accuracy_score

# share of correctly classified test rows, shown as a percentage
test_accuracy_pct = accuracy_score(y_test, y_pred_test)*100
print(f'Model accuracy score: {test_accuracy_pct:0.2f}%')

### Compare the train-set and test-set accuracy


Now, we will compare the train-set and test-set accuracy to check for overfitting.

In [None]:
# predict on the training set itself, to compare against the test-set accuracy
y_pred_train = logreg.predict(X_train)

In [None]:
# share of correctly classified training rows, shown as a percentage
train_accuracy_pct = accuracy_score(y_train, y_pred_train)*100
print(f'Training-set accuracy score: {train_accuracy_pct:0.4f}%')

## Confusion matrix

In [None]:
# Print the Confusion Matrix and slice it into four pieces

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_test)

# scikit-learn layout: rows are the ACTUAL class, columns the PREDICTED class,
# both in sorted label order (0 = no rain, 1 = rain). Hence:
#   cm[0,0] = TN   cm[0,1] = FP
#   cm[1,0] = FN   cm[1,1] = TP
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# visualize confusion matrix with seaborn heatmap

# Label the axes to match scikit-learn's layout: rows are the actual class and
# columns the predicted class, each ordered 0 (negative) then 1 (positive).
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

## Check classification report

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# headline test-set metrics, one per line, rounded to four decimals
test_metrics = [
    ("Accuracy Score", accuracy_score(y_test, y_pred_test)),
    ("Precision Score", precision_score(y_test, y_pred_test)),
    ("Recall Score", recall_score(y_test, y_pred_test)),
    ("F1 Score", f1_score(y_test, y_pred_test)),
]
print("\n".join(f"{label}: {value:.4f}" for label, value in test_metrics))

In [None]:
from sklearn.metrics import classification_report

# per-class precision / recall / F1 and support on the test set
report = classification_report(y_test, y_pred_test)
print(report)

## How do we plot

In [None]:
# Scatter two raw features, colouring every sample by the model's predicted class.
# The predictions come out in train-then-test (shuffled) order, whereas X is in
# the original row order, so X's rows are reordered to line up with them.
# y_train / y_test still carry the original row index after the split.
row_order = y_train.index.append(y_test.index)
predicted = logreg.predict(np.concatenate((X_train,  X_test), axis=0))
plt.scatter(X.loc[row_order, 'Humidity3pm'], X.loc[row_order, 'Sunshine'],  c=predicted)
plt.xlabel('Humidity3pm')
plt.ylabel('Sunshine')

In [None]:
# project the training features onto their first two principal components for a 2-D view
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
# transform once and reuse the result for both plot axes
train_2d = pca.transform(X_train)
plt.scatter(train_2d[:, 0], train_2d[:, 1], c=y_train)