In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score

data = pd.read_csv("Predict Academic Success - data.csv")

# Convert Target labels to numeric values
label_encoder = LabelEncoder()
data["Target"] = label_encoder.fit_transform(data["Target"])  # Dropout = 0, Enrolled = 1, Graduate = 2

# Separate features (X) and target (y)
X = data.iloc[:, :-1]  # All columns except Target
y = data["Target"]  # Target column

# Standardize features for better SVM performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into 90% training & testing, 10% unseen
X_train_test, X_unseen, y_train_test, y_unseen = train_test_split(
    X_scaled, y, test_size=0.10, random_state=42, stratify=y)

# Split 90% into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test, y_train_test, test_size=0.20, random_state=42, stratify=y_train_test)

# Train SVM with 10-fold cross-validation
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
y_pred_cv = cross_val_predict(svm_model, X_train, y_train, cv=cv)

In [41]:
# Step 4: Evaluate the model
conf_matrix = confusion_matrix(y_train, y_pred_cv)
accuracy = accuracy_score(y_train, y_pred_cv)
precision = precision_score(y_train, y_pred_cv, average='weighted')
recall = recall_score(y_train, y_pred_cv, average='weighted')

# ROC-AUC requires probability estimates and binary/multi-class consideration
svm_model.fit(X_train, y_train)
y_proba_cv = svm_model.predict_proba(X_train)
roc_auc = roc_auc_score(y_train, y_proba_cv, multi_class='ovr')  # One-vs-Rest for multi-class

# Return results
print("Training Set Results")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC-AUC:", roc_auc)


Training Set Results
[[ 741  100  182]
 [ 113  173  285]
 [  33   51 1506]]
Accuracy: 0.7600502512562815
Precision: 0.745337489521539
Recall: 0.7600502512562815
ROC-AUC: 0.8689557060040175


In [43]:
# Perform cross-validation
y_pred_cv_test = cross_val_predict(svm_model, X_test, y_test, cv=cv)
# Step 4: Evaluate the model
conf_matrix_test = confusion_matrix(y_test, y_pred_cv_test)
accuracy_test = accuracy_score(y_test, y_pred_cv_test)
precision_test = precision_score(y_test, y_pred_cv_test, average='weighted')
recall_test = recall_score(y_test, y_pred_cv_test, average='weighted')

# ROC-AUC requires probability estimates and binary/multi-class consideration
svm_model.fit(X_train, y_train)
y_proba_cv_test = svm_model.predict_proba(X_test)
roc_auc_test = roc_auc_score(y_test, y_proba_cv_test, multi_class='ovr')  # One-vs-Rest for multi-class

# Return results
print("Testing Set Results")
print(conf_matrix_test)
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("ROC-AUC:", roc_auc_test)

Testing Set Results
[[174  29  53]
 [ 31  36  76]
 [ 18  11 369]]
Accuracy: 0.726474278544542
Precision: 0.7056327923298134
Recall: 0.726474278544542
ROC-AUC: 0.8689557060040175


In [44]:
y_unseen_predict = svm_model.predict(X_unseen)
y_unseen_predict_labels = label_encoder.inverse_transform(y_unseen_predict)
pd.DataFrame({"Actual": label_encoder.inverse_transform(y_unseen), "Predicted": y_unseen_predict_labels})

Unnamed: 0,Actual,Predicted
0,Graduate,Graduate
1,Graduate,Graduate
2,Enrolled,Graduate
3,Enrolled,Enrolled
4,Graduate,Graduate
...,...,...
438,Graduate,Graduate
439,Dropout,Dropout
440,Enrolled,Enrolled
441,Dropout,Graduate
