In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

In [2]:
# Read the raw records into a DataFrame. The path is relative, so the CSV is
# resolved against the notebook's current working directory.
DATASET_CSV = 'Predict Academic Success - data.csv'
data = pd.read_csv(DATASET_CSV)

In [3]:
# Columns that are label-encoded below. The trailing "\t" in the attendance
# entry is part of the key used to index `data`, so it is kept verbatim.
categorical_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance\t",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
]

# One fitted LabelEncoder per column, keyed by column name, kept so the same
# mapping can be applied to new data later.
# NOTE(review): this overwrites the columns of `data` in place; re-running the
# cell would refit each encoder on the already-encoded values, so the saved
# encoders would no longer describe the raw values - reload `data` first.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [4]:
# Name of the label column; it is encoded in place and excluded from X.
TARGET_COLUMN = "Target"

# Replace the target labels ('Dropout', 'Enrolled', 'Graduate') with integer
# codes. LabelEncoder numbers classes in sorted order, so presumably
# Dropout=0, Enrolled=1, Graduate=2 - confirm with `label_encoder.classes_`.
label_encoder = LabelEncoder()
data[TARGET_COLUMN] = label_encoder.fit_transform(data[TARGET_COLUMN])

# Full-dataset target (y) and feature matrix (X = every other column).
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# Stage 1: set aside 10% of all rows as `unseen_data`, stratified on the target.
train_test_data, unseen_data = train_test_split(
    data,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

# Stage 2: split the remaining 90% into 80% train / 20% test (about 72% / 18%
# of the full dataset), again stratified on the target.
train_data, test_data = train_test_split(
    train_test_data,
    test_size=0.20,
    random_state=42,
    stratify=train_test_data[TARGET_COLUMN],
)


In [14]:
def _features_and_target(frame, target_column="Target"):
    """Split ``frame`` into (all columns except the target, the target column)."""
    return frame.drop(columns=[target_column]), frame[target_column]


# Feature matrices and target vectors for the train and test partitions.
X_train, y_train = _features_and_target(train_data)
X_test, y_test = _features_and_target(test_data)

# Gaussian Naive Bayes classifier with default settings.
nb_model = GaussianNB()

# Stratified, shuffled 10-fold cross-validation on the training partition only.
# cross_val_score fits clones of the estimator, so `nb_model` itself is still
# unfitted afterwards and is trained explicitly further down.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(
    nb_model,
    X_train,
    y_train,
    cv=cv,
    scoring='accuracy',
)

print(f"Cross-validation accuracy scores: \n{cross_val_scores}\n")
print(f"Mean cross-validation accuracy: \n{cross_val_scores.mean()}\n")

# Fit on the whole training partition, then score the held-out test partition.
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
class_probabilities = nb_model.predict_proba(X_test)

# Precision and recall are averaged across classes weighted by class support;
# ROC-AUC is computed one-vs-rest from the predicted class probabilities.
weighted = {"average": "weighted"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
roc_auc = roc_auc_score(y_test, class_probabilities, multi_class="ovr")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: \n{accuracy}\n")
print(f"Precision: \n{precision}\n")
print(f"Recall: \n{recall}\n")
print(f"ROC-AUC: \n{roc_auc}\n")
print(f"Confusion Matrix:\n{conf_matrix}")

Cross-validation accuracy scores: 
[0.6677116  0.69278997 0.69592476 0.6677116  0.66352201 0.71383648
 0.67610063 0.62893082 0.67924528 0.72641509]

Mean cross-validation accuracy: 
0.6812188245499892

Accuracy: 
0.6900878293601004

Precision: 
0.6724237127176027

Recall: 
0.6900878293601004

ROC-AUC: 
0.7998736729326316

Confusion Matrix:
[[171  33  52]
 [ 31  40  72]
 [ 31  28 339]]
