In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix


# Read the student-records CSV into the working frame used by every later cell.
data = pd.read_csv('Predict_Academic_Success_Modified.csv')

# Feature columns that get integer-encoded below.
# The names must match the CSV header exactly (including the spelling "Nacionality").
categorical_columns = [
    "Previous qualification",
    "Mother's qualification",
    "Father's qualification",
    "Nacionality",
    "Mother's occupation",
    "Father's occupation",
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# One encoder per column, keyed by column name, kept so the same mapping can
# be re-applied to new data later.
# NOTE(review): the encoders are fit on the full frame, before any train/test
# split happens further down. scikit-learn documents LabelEncoder as a target
# encoder; OrdinalEncoder is the feature-side equivalent — consider switching.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in label_encoders.items():
    # Overwrites the column in place with its integer codes.
    data[col] = encoder.fit_transform(data[col])

In [None]:
# Encode the target classes ('Dropout', 'Enrolled', 'Graduate') as integers.
# The fitted encoder is kept because later cells read `label_encoder.classes_`
# to label the confusion matrices.
# NOTE(review): this overwrites data["Target"] in place, so running the cell a
# second time without reloading `data` refits on the integer codes and the
# original class names are lost from `classes_`.
label_encoder = LabelEncoder()
label_encoder.fit(data["Target"])
data["Target"] = label_encoder.transform(data["Target"])

In [3]:
# Hold out 10% of the rows as a final "unseen" evaluation set; the remaining
# 90% is split again into train/test afterwards.
# Stratifying on Target keeps the class mix approximately equal in both parts,
# and the fixed seed makes the split repeatable.
train_test_data, unseen_data = train_test_split(
    data,
    test_size=0.10,
    random_state=42,
    stratify=data["Target"],
)

In [4]:
# Divide the 90% pool 80/20 into training and testing rows
# (roughly 72% / 18% of the full dataset), again stratified on Target
# and seeded for repeatability.
train_data, test_data = train_test_split(train_test_data, test_size=0.20, random_state=42, stratify=train_test_data["Target"])

In [5]:
# Split each partition into its feature matrix (every column except Target)
# and its label vector (the Target column).
X_train, y_train = train_data.drop(columns=["Target"]), train_data["Target"]
X_test, y_test = test_data.drop(columns=["Target"]), test_data["Target"]

In [6]:
# Gaussian Naive Bayes classifier with the library defaults written out:
# class priors estimated from the training labels (priors=None) and the
# default variance-smoothing term.
# NOTE(review): every feature, including the integer-coded categorical ones,
# is passed to this model — confirm a Gaussian likelihood is intended for them.
nb_model = GaussianNB(priors=None, var_smoothing=1e-9)

In [7]:
# --- Cross-validation on the training split ---------------------------------
# 10 stratified, shuffled folds with a fixed seed so fold membership is repeatable.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(nb_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy scores: \n{cross_val_scores}\n")
print(f"Mean cross-validation accuracy: \n{cross_val_scores.mean()}\n")

# --- Final fit and prediction on the test split -----------------------------
# cross_val_score works on clones of the estimator, so nb_model still needs an
# explicit fit on the whole training split before it can predict.
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)
# Class-membership probabilities, computed once and reused for ROC-AUC.
y_proba = nb_model.predict_proba(X_test)

# --- Evaluation metrics ------------------------------------------------------
# Precision and recall are averaged over classes weighted by class support;
# ROC-AUC uses the one-vs-rest multiclass scheme.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")

# --- Confusion matrix --------------------------------------------------------
# Class names in encoder order; LabelEncoder assigns codes 0..n_classes-1 in
# this same order.
class_labels = label_encoder.classes_

# Pin the label set explicitly. Without `labels=`, confusion_matrix only
# includes classes that occur in y_test or y_pred, so a class missing from
# both would shrink the matrix and break the labelled DataFrame below.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_labels)))

# --- Reporting ---------------------------------------------------------------
results_df = pd.DataFrame({
    "Metric": ["Accuracy:", "Precision:", "Recall:", "ROC-AUC:"],
    "Score": [accuracy, precision, recall, roc_auc]
})

print("\nEvaluation Metrics:")
print(results_df.to_string(index=False))

# Rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)

print("\nConfusion Matrix:\n")
print(conf_matrix_df)

Cross-validation accuracy scores: 
[0.70532915 0.66144201 0.69592476 0.68338558 0.72641509 0.76100629
 0.69496855 0.64779874 0.68553459 0.73899371]

Mean cross-validation accuracy: 
0.7000798485834269


Evaluation Metrics:
    Metric    Score
 Accuracy: 0.691343
Precision: 0.678075
   Recall: 0.691343
  ROC-AUC: 0.793278

Confusion Matrix:

          Dropout  Enrolled  Graduate
Dropout       169        42        45
Enrolled       29        46        68
Graduate       36        26       336


In [8]:
# Split the held-out 10% (created by the first train_test_split) into its
# feature matrix and label vector. Nothing is read from disk here.
X_unseen, y_unseen = unseen_data.drop(columns=["Target"]), unseen_data["Target"]

In [9]:
# Predict class codes for the held-out rows with the model fitted on the
# training split. The model is not refit here, so these rows stay unseen.
y_unseen_pred = nb_model.predict(X_unseen)

In [31]:
# --- Metrics on the held-out ("unseen") 10% ----------------------------------
# Same metric definitions as the test-split evaluation: support-weighted
# precision/recall and one-vs-rest ROC-AUC.
unseen_accuracy = accuracy_score(y_unseen, y_unseen_pred)
unseen_precision = precision_score(y_unseen, y_unseen_pred, average='weighted')
unseen_recall = recall_score(y_unseen, y_unseen_pred, average='weighted')
unseen_roc_auc = roc_auc_score(y_unseen, nb_model.predict_proba(X_unseen), multi_class="ovr")

# Class names in encoder order; LabelEncoder assigns codes 0..n_classes-1 in
# this same order.
unseen_labels = label_encoder.classes_

# Pin the label set explicitly. Without `labels=`, confusion_matrix only
# includes classes that occur in y_unseen or y_unseen_pred; a missing class
# would shrink the matrix and misalign every positional use of it below.
unseen_conf_matrix = confusion_matrix(
    y_unseen, y_unseen_pred, labels=np.arange(len(unseen_labels))
)

# --- Metric table ------------------------------------------------------------
unseen_results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "ROC-AUC"],
    "Score": [unseen_accuracy, unseen_precision, unseen_recall, unseen_roc_auc]
})

print("\nUnseen Data Evaluation Metrics:")
print(unseen_results_df.to_string(index=False))

# --- Confusion matrix (rows = true class, columns = predicted class) --------
unseen_conf_matrix_df = pd.DataFrame(unseen_conf_matrix,
                                     index=unseen_labels,
                                     columns=unseen_labels)

print("\nConfusion Matrix for Unseen Data:\n")
print(unseen_conf_matrix_df)

# --- Per-class match / mismatch summary -------------------------------------
# Diagonal entries are samples whose predicted class equals the true class.
matches = np.diag(unseen_conf_matrix)

# Row total minus the diagonal gives the samples of each true class that were
# assigned to some other class.
mismatches = unseen_conf_matrix.sum(axis=1) - matches

unseen_conf_summary_df = pd.DataFrame({
    "Category": unseen_labels,
    "Matches": matches,
    "Mismatches": mismatches
})

print("\nEvaluation Matrix for Unseen Data:\n")
print(unseen_conf_summary_df.to_string(index=False))


Unseen Data Evaluation Metrics:
   Metric    Score
 Accuracy 0.683973
Precision 0.689478
   Recall 0.683973
  ROC-AUC 0.829791

Confusion Matrix for Unseen Data:

          Dropout  Enrolled  Graduate
Dropout        94        33        15
Enrolled       13        27        40
Graduate       14        25       182

Evaluation Matrix for Unseen Data:

Category  Matches  Mismatches
 Dropout       94          48
Enrolled       27          53
Graduate      182          39
