In [1]:
# import neccessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

In [2]:
# Load Dataset
df = pd.read_csv("C:\\Users\\prasa\\Downloads\\StudentsPerformance.csv")
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Create 'pass' label based on average score >= 50

df['average_score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)
df['pass'] = (df['average_score'] >= 50).astype(int)

In [4]:
# Encode categorical features

le = LabelEncoder()
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [5]:
# Feature and target split

X = df.drop(['average_score', 'pass'], axis=1)
y = df['pass']

In [6]:
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Feature scaling

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Define five ML classification models (Logistic Regression, Decision Tree, Random Forest, SVM, and KNN)

models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier()
}

In [11]:
# Loop through each model:

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    print(f"--- {name} ---")
    print("Classification Report:", classification_report(y_test, y_pred))
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
    print(" ")

--- Logistic Regression ---
Classification Report:               precision    recall  f1-score   support

           0       1.00      0.89      0.94        27
           1       0.98      1.00      0.99       173

    accuracy                           0.98       200
   macro avg       0.99      0.94      0.97       200
weighted avg       0.99      0.98      0.98       200

Confusion Matrix: [[ 24   3]
 [  0 173]]
ROC AUC Score: 0.9993577392421323
 
--- Decision Tree ---
Classification Report:               precision    recall  f1-score   support

           0       0.96      0.93      0.94        27
           1       0.99      0.99      0.99       173

    accuracy                           0.98       200
   macro avg       0.98      0.96      0.97       200
weighted avg       0.98      0.98      0.98       200

Confusion Matrix: [[ 25   2]
 [  1 172]]
ROC AUC Score: 0.9600727895525584
 
--- Random Forest ---
Classification Report:               precision    recall  f1-score   suppo

In [12]:
# Tune Random Forest

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Best Score:", grid_rf.best_score_)

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.99


In [14]:
# Evaluate on test set

best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print("Final Model Evaluation:", classification_report(y_test, y_pred_rf))

Final Model Evaluation:               precision    recall  f1-score   support

           0       1.00      0.96      0.98        27
           1       0.99      1.00      1.00       173

    accuracy                           0.99       200
   macro avg       1.00      0.98      0.99       200
weighted avg       1.00      0.99      0.99       200

