# Predict Students' Dropout and Academic Success

In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
# Fetch UCI ML Repository dataset id 697 and unpack it into the feature
# table (X) and the target table (y) that the rest of the notebook works on.
# NOTE(review): presumably this downloads over the network on every run —
# confirm, and consider caching locally if Restart & Run All must work offline.
dataset = fetch_ucirepo(id=697)
X, y = dataset.data.features, dataset.data.targets

In [3]:
# Explore the dataset: column dtypes, missing values, and a preview of the rows.

# Number of feature columns per dtype.
feature_dtype_counts = X.dtypes.value_counts()
print("\nFeature data types:")
print(feature_dtype_counts)

# Total number of null cells, summed over every column of each table.
missing_in_features = X.isnull().sum().sum()
missing_in_target = y.isnull().sum().sum()
print(f"\nMissing values in features: {missing_in_features}")
print(f"Missing values in target: {missing_in_target}")

# Preview the leading rows of the feature table, then of the target table.
previews = (
    ("\nFirst few rows of features:", X),
    ("\nFirst few rows of target:", y),
)
for heading, table in previews:
    print(heading)
    print(table.head())



Feature data types:
int64      29
float64     7
Name: count, dtype: int64

Missing values in features: 0
Missing values in target: 0

First few rows of features:
   Marital Status  Application mode  Application order  Course  \
0               1                17                  5     171   
1               1                15                  1    9254   
2               1                 1                  5    9070   
3               1                17                  2    9773   
4               2                39                  1    8014   

   Daytime/evening attendance  Previous qualification  \
0                           1                       1   
1                           1                       1   
2                           1                       1   
3                           1                       1   
4                           0                       1   

   Previous qualification (grade)  Nacionality  Mother's qualification  \
0                      

In [4]:
# Encode the target variable as integer class codes.

# The target table is a DataFrame; the encoder is fitted on its first
# (position 0) column.
target_labels = y.iloc[:, 0]
le = LabelEncoder()
y_encoded = le.fit_transform(target_labels)

print(f"Original classes: {le.classes_}")

# A class's integer code is its position in le.classes_, so enumerating the
# fitted classes lists the full label -> code mapping.
print("\nEncoding mapping:")
for i, class_name in enumerate(le.classes_):
    print(f"  {class_name} → {i}")


Original classes: ['Dropout' 'Enrolled' 'Graduate']

Encoding mapping:
  Dropout → 0
  Enrolled → 1
  Graduate → 2


In [5]:
# Split the data into training and test partitions.

# 20% of the rows are held out for testing. The split is stratified on the
# encoded target and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Count the training rows per class code in a single pass; minlength keeps one
# slot per known class even if a class were absent from the training split.
n_train = len(y_train)
train_class_counts = np.bincount(y_train, minlength=len(le.classes_))

print("\nClass distribution in training set:")
for class_name, count in zip(le.classes_, train_class_counts):
    pct = count / n_train * 100
    print(f"  {class_name}: {count} ({pct:.1f}%)")


Training set: 3539 samples
Test set: 885 samples

Class distribution in training set:
  Dropout: 1137 (32.1%)
  Enrolled: 635 (17.9%)
  Graduate: 1767 (49.9%)


In [6]:
# Baseline model: a 100-tree random forest with a fixed seed, trained on the
# training split exactly as produced above (no resampling or reweighting here).
model_baseline = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Evaluate on the held-out test split.
y_pred_baseline = model_baseline.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
report_baseline = classification_report(
    y_test, y_pred_baseline, target_names=le.classes_
)

print(f"\nBaseline Accuracy: {accuracy_baseline:.4f}")
print("\nClassification Report (Baseline):")
print(report_baseline)



Baseline Accuracy: 0.7672

Classification Report (Baseline):
              precision    recall  f1-score   support

     Dropout       0.81      0.75      0.78       284
    Enrolled       0.57      0.37      0.45       159
    Graduate       0.78      0.92      0.85       442

    accuracy                           0.77       885
   macro avg       0.72      0.68      0.69       885
weighted avg       0.75      0.77      0.75       885

