In [None]:
import pandas as pd
import numpy as np
# Utilising AI to make the code.
# Importing all the scikit-learn tools I need for splitting the data
# preprocessing, building ML pipelines, models, and evaluation
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)


In [None]:
# Load the dataset: an Excel workbook holding student academic and
# demographic data
data = pd.read_excel("Maths.xlsx")

# Quick sanity checks: the frame's dimensions and its column names
dataset_shape = data.shape
column_names = data.columns.tolist()
print("Dataset shape:", dataset_shape)
print("Columns:", column_names)

# Display the first few rows as the cell output
data.head()


Dataset shape: (397, 33)
Columns: ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [None]:
# CREATING THE TARGET VARIABLE
# Binary target: 1 = pass (final grade G3 of 10 or more), 0 = fail (G3 below 10).
# This turns the task into a binary classification problem.
data["pass"] = data["G3"].ge(10).astype(int)

# Remove G1, G2 and G3 from the features to avoid data leakage:
# the target is derived directly from G3, and G1/G2 are earlier grades which
# correlate highly with G3, so keeping them would make the prediction
# unrealistically easy.
leakage_columns = ["G1", "G2", "G3"]
y = data["pass"]
X = data.drop(columns=leakage_columns + ["pass"])

print("Class balance:")
print(y.value_counts(normalize=True))  # share of each class, to check for imbalance


Class balance:
pass
1    0.667506
0    0.332494
Name: proportion, dtype: float64


In [None]:
# SEPARATING CATEGORICAL AND NUMERIC FEATURES
# Numeric columns are picked with the generic "number" selector and every
# remaining column is treated as categorical, so the two lists always
# partition X. Matching exact dtypes ("object", "int64", "float64") would leave
# columns of any other dtype (e.g. int32, bool, category, string) in neither
# list, and a ColumnTransformer with remainder="drop" would then discard them
# without warning.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Categorical features:", categorical_features)
print("Numeric features:", numeric_features)


Categorical features: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
Numeric features: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [None]:
# PREPROCESSING STEPS
# Numeric features are standardised because many ML models perform better
# when numeric values are on a similar scale.
numeric_transformer = StandardScaler()

# Categorical features are one-hot encoded to convert text labels into
# numeric columns that ML models can process. handle_unknown="ignore" makes
# the encoder tolerate categories at predict time that were not seen in fit.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# ColumnTransformer applies a different preprocessing step to the numeric and
# the categorical columns in a single object.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# NOTE: this cell used to end with a copy of the train/test split that is
# repeated verbatim in the dedicated TRAIN-TEST SPLIT cell below; the copy was
# removed so the split is defined in exactly one place.

In [None]:
# TRAIN-TEST SPLIT
# Hold out 20% of the rows for testing. Passing stratify=y keeps the same
# proportion of pass/fail in both the training and the test set.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,  # maintains pass/fail ratio
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# LOGISTIC REGRESSION MODEL
# Preprocessing and the classifier are wrapped in one pipeline so that the
# exact same transformations are applied to both training and test data.
#
# Each pipeline receives its own clone of the preprocessor. A Pipeline fits
# the step objects it is given, so handing the same ColumnTransformer instance
# to both pipelines would make them share fitted state: refitting one pipeline
# (e.g. on a different subset of rows) would silently change the preprocessing
# used by the other.
log_reg_clf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(max_iter=2000))
    ]
)

# RANDOM FOREST MODEL
rf_clf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(
            n_estimators=300,  # number of trees
            random_state=42    # fixed seed so the forest is reproducible
        ))
    ]
)


In [23]:
# Fit each pipeline on the training split, announcing it first.
training_plan = (
    ("Training Logistic Regression...", log_reg_clf),
    ("Training Random Forest...", rf_clf),
)
for message, pipeline in training_plan:
    print(message)
    fitted = pipeline.fit(X_train, y_train)  # fit() returns the pipeline itself

# The last pipeline fitted (Random Forest) is the cell's displayed value
fitted


Training Logistic Regression...
Training Random Forest...


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# MODEL EVALUATION FUNCTION
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted model on held-out data and print a short report.

    Predicts on ``X_test``, computes accuracy, precision, recall and F1
    against ``y_test`` (undefined ratios are reported as 0 via
    ``zero_division=0``), then prints those scores followed by the confusion
    matrix and the classification report.

    Parameters
    ----------
    name : label used in the printed header and in the returned record.
    model : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels matching ``X_test``.

    Returns
    -------
    dict with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    predictions = model.predict(X_test)

    # Build the returned record directly; the printout below reads from it
    summary = {
        "model": name,
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, zero_division=0),
        "recall": recall_score(y_test, predictions, zero_division=0),
        "f1": f1_score(y_test, predictions, zero_division=0)
    }
    matrix = confusion_matrix(y_test, predictions)

    print(f"=== {name} ===")
    print(f"Accuracy:  {summary['accuracy']:.3f}")
    print(f"Precision: {summary['precision']:.3f}")
    print(f"Recall:    {summary['recall']:.3f}")
    print(f"F1 Score:  {summary['f1']:.3f}")
    print("\nConfusion matrix:")
    print(matrix)
    print("\nClassification report:")
    print(classification_report(y_test, predictions, zero_division=0))

    return summary


In [None]:
# RUNNING BOTH MODELS AND COMPARING RESULTS
# Evaluate the fitted pipelines in a fixed order and collect one metrics
# record per model.
results = [
    evaluate_model(label, fitted_model, X_test, y_test)
    for label, fitted_model in (
        ("Logistic Regression", log_reg_clf),
        ("Random Forest", rf_clf),
    )
]


=== Logistic Regression ===
Accuracy:  0.675
Precision: 0.721
Recall:    0.830
F1 Score:  0.772

Confusion matrix:
[[10 17]
 [ 9 44]]

Classification report:
              precision    recall  f1-score   support

           0       0.53      0.37      0.43        27
           1       0.72      0.83      0.77        53

    accuracy                           0.68        80
   macro avg       0.62      0.60      0.60        80
weighted avg       0.66      0.68      0.66        80

=== Random Forest ===
Accuracy:  0.650
Precision: 0.681
Recall:    0.887
F1 Score:  0.770

Confusion matrix:
[[ 5 22]
 [ 6 47]]

Classification report:
              precision    recall  f1-score   support

           0       0.45      0.19      0.26        27
           1       0.68      0.89      0.77        53

    accuracy                           0.65        80
   macro avg       0.57      0.54      0.52        80
weighted avg       0.60      0.65      0.60        80



In [None]:
# Tabulate the per-model metric records (one row per model) so the two
# models can be compared side by side
results_df = pd.DataFrame(data=results)

# Display the comparison table as the cell output
results_df


Unnamed: 0,model,accuracy,precision,recall,f1
0,Logistic Regression,0.675,0.721311,0.830189,0.77193
1,Random Forest,0.65,0.681159,0.886792,0.770492
