# Machine Learning Classification models and Evaluation metrics

## Importing Libraries

In [1]:
%pip install scikit-learn



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

import joblib
import os


## Exploratory Data Analysis

In [3]:
dataset = pd.read_csv("https://drive.google.com/uc?export=download&id=19a8CJYGaxwRly083AXsUE7-PRCogiTP-")

In [4]:
dataset.head(10)

Unnamed: 0,student_id,age,gender,grade_level,study_hours_per_day,uses_ai,ai_usage_time_minutes,ai_tools_used,ai_usage_purpose,ai_dependency_score,...,concept_understanding_score,study_consistency_index,improvement_rate,sleep_hours,social_media_hours,tutoring_hours,class_participation_score,final_score,passed,performance_category
0,1,20,Female,1st Year,2.5,1,170,,Exam Prep,10,...,4,9.0,8.1,7.2,1.5,4.6,6,36.8,0,Low
1,2,17,Male,12th,3.4,1,123,,Notes,4,...,8,8.5,13.8,7.6,5.1,2.7,6,65.5,1,Medium
2,3,24,Male,3rd Year,0.8,0,35,Copilot,Doubt Solving,8,...,7,2.1,39.1,5.7,0.2,4.0,3,66.3,1,Medium
3,4,21,Female,12th,4.4,0,45,ChatGPT+Gemini,Notes,3,...,5,6.7,25.2,8.2,4.2,2.9,2,69.5,1,Medium
4,5,18,Other,3rd Year,3.5,1,21,ChatGPT+Gemini,Coding,2,...,5,5.4,2.7,8.7,0.2,2.9,2,49.7,1,Low
5,6,20,Male,1st Year,5.1,1,34,Copilot,Coding,10,...,9,8.0,13.4,5.5,3.7,2.1,4,77.9,1,High
6,7,23,Female,1st Year,2.0,1,112,ChatGPT+Gemini,,8,...,9,9.6,3.7,5.8,1.6,3.4,5,82.2,1,High
7,8,16,Male,3rd Year,5.5,0,174,ChatGPT,Homework,4,...,6,4.3,13.2,6.0,0.4,2.5,6,52.7,1,Medium
8,9,20,Female,10th,6.0,1,32,Copilot,,9,...,9,7.8,18.8,5.4,1.6,4.2,4,81.7,1,High
9,10,24,Female,1st Year,5.6,0,140,,Doubt Solving,3,...,7,2.9,38.5,7.2,0.7,4.6,1,66.7,1,Medium


In [5]:
print("Dataset Shape: ", dataset.shape)

Dataset Shape:  (8000, 26)


In [6]:
# Checking missing values and dataset statistics

print(f"\n\n Missing data found in dataset = ", dataset.isnull().sum(), "\n\n")




 Missing data found in dataset =  student_id                            0
age                                   0
gender                                0
grade_level                           0
study_hours_per_day                   0
uses_ai                               0
ai_usage_time_minutes                 0
ai_tools_used                      1362
ai_usage_purpose                   1346
ai_dependency_score                   0
ai_generated_content_percentage       0
ai_prompts_per_week                   0
ai_ethics_score                       0
last_exam_score                       0
assignment_scores_avg                 0
attendance_percentage                 0
concept_understanding_score           0
study_consistency_index               0
improvement_rate                      0
sleep_hours                           0
social_media_hours                    0
tutoring_hours                        0
class_participation_score             0
final_score                           0
pass

In [7]:
dataset.describe()

Unnamed: 0,student_id,age,study_hours_per_day,uses_ai,ai_usage_time_minutes,ai_dependency_score,ai_generated_content_percentage,ai_prompts_per_week,ai_ethics_score,last_exam_score,...,attendance_percentage,concept_understanding_score,study_consistency_index,improvement_rate,sleep_hours,social_media_hours,tutoring_hours,class_participation_score,final_score,passed
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,4000.5,18.96725,3.286938,0.641,88.989875,5.515125,50.0575,59.3595,5.472375,59.50125,...,69.852112,5.48475,5.51645,10.127325,6.465025,3.0106,2.523387,5.463375,56.811988,0.88925
std,2309.54541,3.15316,1.582171,0.479737,52.184077,2.883064,29.440109,34.869496,2.872067,23.161888,...,17.234339,2.898546,2.586377,17.479288,1.43567,1.744974,1.436991,2.881917,13.455724,0.313842
min,1.0,14.0,0.5,0.0,0.0,1.0,0.0,0.0,1.0,20.0,...,40.0,1.0,1.0,-20.0,4.0,0.0,0.0,1.0,12.7,0.0
25%,2000.75,16.0,1.9,0.0,43.0,3.0,25.0,29.0,3.0,39.75,...,54.8,3.0,3.2,-5.0,5.2,1.5,1.3,3.0,47.3,1.0
50%,4000.5,19.0,3.3,1.0,89.0,6.0,50.0,59.0,6.0,59.0,...,70.0,5.0,5.5,10.15,6.4,3.0,2.5,5.0,56.9,1.0
75%,6000.25,22.0,4.6,1.0,134.0,8.0,76.0,90.0,8.0,80.0,...,84.6,8.0,7.8,25.5,7.7,4.5,3.8,8.0,66.2,1.0
max,8000.0,24.0,6.0,1.0,179.0,10.0,100.0,119.0,10.0,99.0,...,100.0,10.0,10.0,40.0,9.0,6.0,5.0,10.0,95.8,1.0


In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   student_id                       8000 non-null   int64  
 1   age                              8000 non-null   int64  
 2   gender                           8000 non-null   object 
 3   grade_level                      8000 non-null   object 
 4   study_hours_per_day              8000 non-null   float64
 5   uses_ai                          8000 non-null   int64  
 6   ai_usage_time_minutes            8000 non-null   int64  
 7   ai_tools_used                    6638 non-null   object 
 8   ai_usage_purpose                 6654 non-null   object 
 9   ai_dependency_score              8000 non-null   int64  
 10  ai_generated_content_percentage  8000 non-null   int64  
 11  ai_prompts_per_week              8000 non-null   int64  
 12  ai_ethics_score     

In [9]:
for col in dataset.columns[1:]:
    print(f"Unique values in column '{col}':")
    print(dataset[col].unique())
    print("\n")

Unique values in column 'age':
[20 17 24 21 18 23 16 19 15 14 22]


Unique values in column 'gender':
['Female' 'Male' 'Other']


Unique values in column 'grade_level':
['1st Year' '12th' '3rd Year' '10th' '11th' '2nd Year']


Unique values in column 'study_hours_per_day':
[2.5 3.4 0.8 4.4 3.5 5.1 2.  5.5 6.  5.6 4.8 5.  3.7 1.5 4.2 5.9 4.3 2.3
 1.4 0.6 2.4 5.3 0.7 2.9 4.7 5.7 2.8 4.6 4.  1.9 3.3 1.2 3.9 1.6 5.8 2.1
 3.2 4.5 3.6 1.3 5.2 0.9 4.9 2.7 3.  5.4 4.1 1.8 3.8 2.2 1.7 2.6 0.5 3.1
 1.  1.1]


Unique values in column 'uses_ai':
[1 0]


Unique values in column 'ai_usage_time_minutes':
[170 123  35  45  21  34 112 174  32 140  55  66  12  49  27 103  97  95
  86  87 151 124  80   6  73  82  23 136 113  14 121  85 144  18   8  63
  37 135 133 176 101 172 148  92  33 139  38   5  31 165 146 179 155  61
 175 166  25   0 162  76 102  75  81  68 158 105 142 119  47  67  84 125
 167 120 128  19 149  64  60  78  13  48 168 115  89  88 178 116  53   3
 104  65 137  50  29 171 159  62  11  

The column ***uses_ai*** does not help in classifying the performance as whether this field is yes or no, the students always have a ***ai_usage_time_minutes*** present which is always greater than 0. This later field could be used to determine how much time each student spends on AI.

## Data Pre-Processing

Preprocessing data:
1.  Handle missing values
2.  Encode categorical variables
3.  Split into train/test sets
4.  Feature scaling



### Handling Missing Values

In [10]:
# Data pre-processing

# 1. Separate features (X) and target (y)
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

# 2. Handle Missing Values
imputer_num = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

numerical_columns = [1, 4, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
categorical_columns = [2, 3, 7, 8]
missing_values_columns = [7, 8]

for col in missing_values_columns:
    imputer_cat.fit(X[:, col].reshape(-1, 1))
    X[:, col] = imputer_cat.transform(X[:, col].reshape(-1, 1)).flatten()

print("Matrix of features: ", X[:10, 7:9])

# 3. Encode categorical variables
categorical_binary_cols = [2]
categorical_non_binary_cols = [3, 7, 8]

print("Target before label encoding: ", y[:10])

oe = OrdinalEncoder()
y = oe.fit_transform(y.reshape(-1, 1)).flatten()

print("Target after label encoding: ", y[:10])

Matrix of features:  [['ChatGPT+Gemini' 'Exam Prep']
 ['ChatGPT+Gemini' 'Notes']
 ['Copilot' 'Doubt Solving']
 ['ChatGPT+Gemini' 'Notes']
 ['ChatGPT+Gemini' 'Coding']
 ['Copilot' 'Coding']
 ['ChatGPT+Gemini' 'Exam Prep']
 ['ChatGPT' 'Homework']
 ['Copilot' 'Exam Prep']
 ['ChatGPT+Gemini' 'Doubt Solving']]
Target before label encoding:  ['Low' 'Medium' 'Medium' 'Medium' 'Low' 'High' 'High' 'Medium' 'High'
 'Medium']
Target after label encoding:  [1. 2. 2. 2. 1. 0. 0. 2. 0. 2.]


## Visualizing Relationships between Dataset Features & Target

In [11]:
dataset_columns = dataset.columns

print(dataset_columns)

Index(['student_id', 'age', 'gender', 'grade_level', 'study_hours_per_day',
       'uses_ai', 'ai_usage_time_minutes', 'ai_tools_used', 'ai_usage_purpose',
       'ai_dependency_score', 'ai_generated_content_percentage',
       'ai_prompts_per_week', 'ai_ethics_score', 'last_exam_score',
       'assignment_scores_avg', 'attendance_percentage',
       'concept_understanding_score', 'study_consistency_index',
       'improvement_rate', 'sleep_hours', 'social_media_hours',
       'tutoring_hours', 'class_participation_score', 'final_score', 'passed',
       'performance_category'],
      dtype='object')


In [12]:

# ===============================
# ENCODING CATEGORICAL FEATURES
# ===============================

# Identifying column indices based on previous inspection
# categorical_columns = [2, 3, 7, 8] -> gender, grade_level, ai_tools_used, ai_usage_purpose

ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), [2, 3, 7, 8])
    ],
    remainder='passthrough'
)

X_encoded = ct.fit_transform(X)

# Split the encoded data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# ===============================
# SCALING
# ===============================

scaler = StandardScaler(with_mean=False) # with_mean=False because OneHot creates sparse-like data

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===============================
# CREATE MODEL DIRECTORY
# ===============================

os.makedirs("models", exist_ok=True)

# ===============================
# DEFINE MODELS
# ===============================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# ===============================
# TRAIN AND EVALUATE
# ===============================

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Use scaled data for all for consistency, or specific ones as before
    if name in ["Logistic Regression", "KNN", "Naive Bayes"]:
        # Naive Bayes (Gaussian) needs dense input; OneHot output can be sparse
        X_tr = X_train_scaled.toarray() if hasattr(X_train_scaled, "toarray") else X_train_scaled
        X_te = X_test_scaled.toarray() if hasattr(X_test_scaled, "toarray") else X_test_scaled
        model.fit(X_tr, y_train)
        y_pred = model.predict(X_te)
        y_prob = model.predict_proba(X_te)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([name, accuracy, precision, recall, f1, mcc])

    print(f"Accuracy: {accuracy:.4f}")

    # SAVE MODEL (THIS WAS MISSING)
    filename = name.replace(" ", "_").lower() + ".pkl"
    joblib.dump(model, f"models/{filename}")

# ===============================
# RESULTS TABLE
# ===============================

results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "MCC Score"]
)

print("\nFinal Comparison Table:")
display(results_df)

# Save results
results_df.to_csv("models/model_results.csv", index=False)
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(ct, "models/column_transformer.pkl")


Training Logistic Regression...
Accuracy: 0.9881

Training Decision Tree...
Accuracy: 1.0000

Training KNN...
Accuracy: 0.7000

Training Naive Bayes...
Accuracy: 0.7462

Training Random Forest...
Accuracy: 1.0000

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 1.0000

Final Comparison Table:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,MCC Score
0,Logistic Regression,0.988125,0.988219,0.988125,0.98804,0.978801
1,Decision Tree,1.0,1.0,1.0,1.0,1.0
2,KNN,0.7,0.706238,0.7,0.675577,0.432966
3,Naive Bayes,0.74625,0.816317,0.74625,0.708083,0.576329
4,Random Forest,1.0,1.0,1.0,1.0,1.0
5,XGBoost,1.0,1.0,1.0,1.0,1.0


['models/column_transformer.pkl']