In [32]:
# Install necessary libraries if not already installed
!pip install missingno
!pip install statsmodels
!pip install imbalanced-learn
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, scikeras
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikeras-0.13.0 scikit-learn-1.5.2


In [13]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Machine learning models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [20]:
# Mount Google Drive if the files are stored there
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
data_path = '/content/drive/My Drive/ML A1/student+performance/student-por.csv'  # Update this path accordingly
data = pd.read_csv(data_path, sep=';')  # The separator is ';' in these files

# Display the first few rows
print("First 5 rows of the dataset:")
data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
First 5 rows of the dataset:


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [21]:
# Get basic information about the dataset
print("\nDataset Info:")
data.info()



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object


In [22]:
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64


In [23]:
# Descriptive statistics
print("\nDescriptive Statistics:")
data.describe()



Descriptive Statistics:


Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


In [24]:
# Define pass/fail threshold
passing_grade = 10  # Assuming 10 is the passing grade out of 20

# Create a new target variable (final grade)
data['pass'] = data['G3'] >= passing_grade

# Map to integer labels
data['pass'] = data['pass'].astype(int)

# Drop original grade columns if not needed
data = data.drop(['G1', 'G2', 'G3'], axis=1)

# Check class distribution
print("\nClass Distribution:")
print(data['pass'].value_counts())



Class Distribution:
pass
1    549
0    100
Name: count, dtype: int64


Data preprocessing

In [25]:
# Identify categorical and numerical features
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_features.remove('pass')  # Exclude the target variable

print("\nCategorical Features:")
print(categorical_features)

print("\nNumerical Features:")
print(numerical_features)



Categorical Features:
['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

Numerical Features:
['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [26]:
# One-Hot Encode categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)

print("\nData after encoding:")
print(data_encoded.head())

# Update the list of features
all_features = data_encoded.columns.tolist()
all_features.remove('pass')  # Exclude the target variable



Data after encoding:
   age  Medu  Fedu  traveltime  studytime  failures  famrel  freetime  goout  \
0   18     4     4           2          2         0       4         3      4   
1   17     1     1           1          2         0       5         3      3   
2   15     1     1           1          2         0       4         3      2   
3   15     4     2           1          3         0       3         2      2   
4   16     3     3           1          2         0       4         3      2   

   Dalc  ...  guardian_mother  guardian_other  schoolsup_yes  famsup_yes  \
0     1  ...             True           False           True       False   
1     1  ...            False           False          False        True   
2     2  ...             True           False           True       False   
3     1  ...             True           False          False        True   
4     1  ...            False           False          False        True   

   paid_yes  activities_yes  nursery_yes

In [27]:
# Feature Scaling
scaler = StandardScaler()
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])

# Check class distribution
print("\nClass Distribution After Encoding:")
print(data_encoded['pass'].value_counts())



Class Distribution After Encoding:
pass
1    549
0    100
Name: count, dtype: int64


In [28]:
# Separate features and target variable
X = data_encoded.drop('pass', axis=1).values
y = data_encoded['pass']

print("\nShape of Features:", X.shape)
print("Shape of Target:", y.shape)


Shape of Features: (649, 39)
Shape of Target: (649,)


In [29]:
# Define the evaluation function
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model_with_smote_metrics(model, X, y):
    # Define the pipeline with SMOTE
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Define stratified k-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Lists to store metrics
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    auc_roc_scores = []

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the model
        pipeline.fit(X_train, y_train)

        # Predict
        y_pred = pipeline.predict(X_test)

        # Predict probabilities for AUC-ROC
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
        auc_roc_scores.append(roc_auc_score(y_test, y_pred_proba))

    # Print results
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f}")
    print(f"Mean Precision: {np.mean(precision_scores):.4f}")
    print(f"Mean Recall: {np.mean(recall_scores):.4f}")
    print(f"Mean F1-score: {np.mean(f1_scores):.4f}")
    print(f"Mean AUC-ROC: {np.mean(auc_roc_scores):.4f}")

Neural network

In [33]:
from scikeras.wrappers import KerasClassifier

def create_nn_model():
    model = Sequential()
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [34]:
# Wrap the model using KerasClassifier
nn_model = KerasClassifier(build_fn=create_nn_model, epochs=50, batch_size=32, verbose=0)

print("\nEvaluating Neural Network with SMOTE:")
evaluate_model_with_smote_metrics(nn_model, X, y)

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7c78039c01f0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1175, in _make_controller_from_path
    lib_controller = controller_class(
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
  File "/usr/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: dlopen() error



Evaluating Neural Network with SMOTE:


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Mean Accuracy: 0.8244
Mean Precision: 0.8881
Mean Recall: 0.9072
Mean F1-score: 0.8971
Mean AUC-ROC: 0.7666


SVM

In [35]:
# SVM with Linear Kernel
print("\nEvaluating SVM with Linear Kernel and SMOTE:")
svm_linear = SVC(kernel='linear', probability=True, random_state=42)
evaluate_model_with_smote_metrics(svm_linear, X, y)


Evaluating SVM with Linear Kernel and SMOTE:
Mean Accuracy: 0.7627
Mean Precision: 0.9281
Mean Recall: 0.7815
Mean F1-score: 0.8477
Mean AUC-ROC: 0.7813


In [36]:
# SVM with RBF Kernel
print("\nEvaluating SVM with RBF Kernel and SMOTE:")
svm_rbf = SVC(kernel='rbf', probability=True, random_state=42)
evaluate_model_with_smote_metrics(svm_rbf, X, y)


Evaluating SVM with RBF Kernel and SMOTE:
Mean Accuracy: 0.8320
Mean Precision: 0.8975
Mean Recall: 0.9053
Mean F1-score: 0.9012
Mean AUC-ROC: 0.7817


k-NN

In [37]:
k_values = [3, 5, 7, 9]
for k in k_values:
    print(f"\nEvaluating k-NN with k={k} and SMOTE:")
    knn = KNeighborsClassifier(n_neighbors=k)
    evaluate_model_with_smote_metrics(knn, X, y)


Evaluating k-NN with k=3 and SMOTE:
Mean Accuracy: 0.6811
Mean Precision: 0.9016
Mean Recall: 0.6995
Mean F1-score: 0.7876
Mean AUC-ROC: 0.6636

Evaluating k-NN with k=5 and SMOTE:
Mean Accuracy: 0.6657
Mean Precision: 0.9132
Mean Recall: 0.6685
Mean F1-score: 0.7718
Mean AUC-ROC: 0.6890

Evaluating k-NN with k=7 and SMOTE:
Mean Accuracy: 0.6625
Mean Precision: 0.9208
Mean Recall: 0.6575
Mean F1-score: 0.7671
Mean AUC-ROC: 0.7079

Evaluating k-NN with k=9 and SMOTE:
Mean Accuracy: 0.6471
Mean Precision: 0.9180
Mean Recall: 0.6393
Mean F1-score: 0.7530
Mean AUC-ROC: 0.7099


In [38]:
# AdaBoost with Decision Trees
print("\nEvaluating AdaBoost with Decision Trees and SMOTE:")
weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)
ada_boost = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=50, random_state=42)
evaluate_model_with_smote_metrics(ada_boost, X, y)


Evaluating AdaBoost with Decision Trees and SMOTE:


  def __len__(self):
  def __len__(self):
  def __len__(self):
  def __len__(self):
  def __len__(self):


Mean Accuracy: 0.8243
Mean Precision: 0.8866
Mean Recall: 0.9089
Mean F1-score: 0.8976
Mean AUC-ROC: 0.7330
