In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
)
data = pd.read_csv('Predict_Academic_Success_Modified.csv')

In [2]:
# convert the target using Label Encoding ('Dropout', 'Enrolled', 'Graduate')
label_encoder = LabelEncoder()
data["Target"] = label_encoder.fit_transform(data["Target"]) 

In [3]:
# 90% for training/testing, 10% for unseen validation
train_test_data, unseen_data = train_test_split(data, test_size=0.10, random_state=42, stratify=data["Target"])

In [4]:
# For training data
# Get the target and feature of the training/testing data 
X = train_test_data.drop(columns=["Target"]) # Features
y = train_test_data["Target"] # Target

In [5]:
# encode categorical features
categorical_cols = ["Previous qualification", "Mother's qualification", "Father's qualification", "Nacionality",
                    "Mother's occupation", "Father's occupation", "Marital status", "Application mode",
                    "Course", "Daytime/evening attendance	", "Displaced", "Educational special needs", 
                    "Debtor", "Tuition fees up to date", "Gender", "Scholarship holder", "International"]

for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col])  

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [6]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Train the model 80% training and 20% testing
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [8]:
#apply 10 fold cross validation testing
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn, X_train, y_train, cv=kf, scoring="accuracy")
# Make a prediction
y_pred = knn.predict(X_test)

In [9]:
# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, knn.predict_proba(X_test), multi_class="ovr")
conf_matrix = confusion_matrix(y_test, y_pred)
# Disctionary for displatyng the result in tabular form
testing_metrics = {
    "Metric": [
        "Cross-Validation Accuracy",
        "Accuracy",
        "Precision",
        "Recall",
        "ROC-AUC Score"
    ],
    "Value": [
        np.mean(cv_scores),
        accuracy,
        precision,
        recall,
        roc_auc
    ]
}
df_testing_metrics = pd.DataFrame(testing_metrics)


# Display Confusion Matrix of testing data
print("\nConfusion Matrix (Testing Data):\n", conf_matrix)
# Display the results for testing data
print("\nTesting Data Evaluation Results\n")
df_testing_metrics


Confusion Matrix (Testing Data):
 [[193  31  49]
 [ 31  44  59]
 [ 38  30 322]]

Testing Data Evaluation Results



Unnamed: 0,Metric,Value
0,Cross-Validation Accuracy,0.669602
1,Accuracy,0.70138
2,Precision,0.689212
3,Recall,0.70138
4,ROC-AUC Score,0.786714


In [10]:
#For unseen data
# Get the target and feature of the training/testing data 
X_unseen = unseen_data.drop(columns=["Target"]) # Features
y_unseen = unseen_data["Target"] # Target

In [11]:
# encode categorical features
for col in categorical_cols:
    X_unseen[col] = LabelEncoder().fit_transform(X_unseen[col])  

X_unseen = pd.DataFrame(scaler.fit_transform(X_unseen), columns=X_unseen.columns)

In [12]:
#apply 10 fold cross validation
unseen_cv_scores = cross_val_score(knn, X_unseen, y_unseen, cv=kf, scoring="accuracy")
# Make a prediction
y_unseen_pred = knn.predict(X_unseen)

In [13]:
# Evaluation Metrics
unseen_accuracy = accuracy_score(y_unseen, y_unseen_pred)
unseen_precision = precision_score(y_unseen, y_unseen_pred, average='weighted')
unseen_recall = recall_score(y_unseen, y_unseen_pred, average='weighted')
unseen_roc_auc = roc_auc_score(y_unseen, knn.predict_proba(X_unseen), multi_class="ovr")
unseen_conf_matrix = confusion_matrix(y_unseen, y_unseen_pred)

# Disctionary for displatyng the result in tabular form
unseen_metrics = {
    "Metric": [
        "Cross-Validation Accuracy",
        "Accuracy",
        "Precision",
        "Recall",
        "ROC-AUC Score"
    ],
    "Value": [
        np.mean(unseen_cv_scores),
        unseen_accuracy,
        unseen_precision,
        unseen_recall,
        unseen_roc_auc
    ]
}
df_unseen_metrics = pd.DataFrame(unseen_metrics)

# Display Confusion Matrix  of unseen data
print("\nConfusion Matrix (Unseen Data):\n", unseen_conf_matrix)
# Display the results for unseen data
print("\nUnseen Data Evaluation Results\n")
df_unseen_metrics


Confusion Matrix (Unseen Data):
 [[ 94  22  26]
 [ 16  24  40]
 [ 15  11 195]]

Unseen Data Evaluation Results



Unnamed: 0,Metric,Value
0,Cross-Validation Accuracy,0.66803
1,Accuracy,0.706546
2,Precision,0.689804
3,Recall,0.706546
4,ROC-AUC Score,0.785668
