In [1]:
# Data link: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success
# Citation: Realinho, V., Vieira Martins, M., Machado, J., & Baptista, L. (2021). Predict Students' Dropout and Academic Success [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5MC89.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# To launch tensorboard, run this in CMD: tensorboard --logdir=runs --port=6006
from torch.utils.tensorboard import SummaryWriter

import pandas as pd

import numpy as np

from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix,
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.dummy import DummyClassifier

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from ucimlrepo import fetch_ucirepo

fetched_data = fetch_ucirepo(name="Predict Students' Dropout and Academic Success")

In [3]:
fetched_data.metadata.additional_info

{'summary': None,
 'purpose': 'The dataset was created in a project that aims to contribute to the reduction of academic dropout and failure in higher education, by using machine learning techniques to identify students at risk at an early stage of their academic path, so that strategies to support them can be put into place. \n\nThe dataset includes information known at the time of student enrollment – academic path, demographics, and social-economic factors. \n\nThe problem is formulated as a three category classification task (dropout, enrolled, and graduate) at the end of the normal duration of the course. \n',
 'funded_by': 'This dataset is supported by program SATDAP - Capacitação da Administração Pública under grant POCI-05-5762-FSE-000191, Portugal.',
 'instances_represent': 'Each instance is a student',
 'recommended_data_splits': 'The dataset was used, in our project, with a data split of 80% for training and 20% for test.',
 'sensitive_data': None,
 'preprocessing_descriptio

In [4]:
X = fetched_data.data.features
X

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0,0.000000,0,10.8,1.4,1.74
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,0,6,0,0,0.000000,0,10.8,1.4,1.74
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,0,6,10,5,12.400000,0,9.4,-0.8,-3.12
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,0,6,6,6,13.000000,0,13.9,-0.3,0.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,0,6,8,5,12.666667,0,15.5,2.8,-4.06
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,0,6,6,2,11.000000,0,11.1,0.6,2.02
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,0,8,9,1,13.500000,0,13.9,-0.3,0.79
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,0,5,6,5,12.000000,0,9.4,-0.8,-3.12


In [5]:
y = fetched_data.data.targets
y

Unnamed: 0,Target
0,Dropout
1,Graduate
2,Dropout
3,Graduate
4,Graduate
...,...
4419,Graduate
4420,Dropout
4421,Dropout
4422,Graduate


In [6]:
# Combine into one dataset
df = X
# Get numerical values for target
df["Target"] = np.where(
    y["Target"] == "Dropout", 0, np.where(y["Target"] == "Enrolled", 1, 2)
)
df

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,2
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,2
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,2


In [7]:
# Define numerical columns which will not be embedded
numerical_columns = [
    "Application order",
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]
# Define categorical columns which will be embedded
categorical_columns = [
    "Marital Status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "International",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
]

In [8]:
# Get training numerical set
train_numerical = df[numerical_columns].sample(frac=0.8, random_state=0)
# Put rest in testing set
test_numerical = df[numerical_columns].drop(train_numerical.index)
# Pull in mean and std from training, assume won't have visibility into testing
continuous_means = train_numerical[numerical_columns].mean()
continuous_stds = train_numerical[numerical_columns].std()
# Normalize numerical columns
train_continuous = (train_numerical - continuous_means) / continuous_stds
test_continuous = (test_numerical - continuous_means) / continuous_stds
# Make a tensor of the values
X_train_continuous_tensor = torch.tensor(train_continuous.values, dtype=torch.float32)
X_test_continuous_tensor = torch.tensor(test_continuous.values, dtype=torch.float32)

In [9]:
# Process categorical columns for embedding - note this happens on full dataset to avoid embed errors if something exists in Test but not Train
df_categorical = df[categorical_columns].copy()
# Define mapping and categorical numbers
category_maps = {}
num_categories = {}
for col in categorical_columns:
    # Factorize the categories to integer indices
    df_categorical[col], uniques = pd.factorize(df_categorical[col])
    category_maps[col] = uniques  # Save the mapping for later use
    num_categories[col] = len(uniques)
# Get train and test data from categorical
X_train_categorical = df_categorical.sample(frac=0.8, random_state=0)
X_test_categorical = df_categorical.drop(X_train_categorical.index)
# Get train and test y values
y_train_categorical = df["Target"].sample(frac=0.8, random_state=0)
y_test_categorical = df["Target"].drop(y_train_categorical.index)
# Make a tensor of the values
X_train_categorical_tensor = torch.tensor(X_train_categorical.values, dtype=torch.long)
y_train_categorical_tensor = torch.tensor(y_train_categorical.values, dtype=torch.long)
X_test_categorical_tensor = torch.tensor(X_test_categorical.values, dtype=torch.long)
y_test_categorical_tensor = torch.tensor(y_test_categorical.values, dtype=torch.long)


# Make a custom class for processing the categoricals so they get embedded properly
class CustomDataset(Dataset):
    def __init__(self, continuous_data, categorical_data, targets):
        self.continuous_data = continuous_data
        self.categorical_data = categorical_data
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        x_cont = self.continuous_data[idx]
        x_cat = self.categorical_data[idx]
        y = self.targets[idx]
        return x_cont, x_cat, y


# Make the datasets
train_dataset = CustomDataset(
    X_train_continuous_tensor, X_train_categorical_tensor, y_train_categorical_tensor
)
test_dataset = CustomDataset(
    X_test_continuous_tensor, X_test_categorical_tensor, y_test_categorical_tensor
)
# Make loaders from the dataset
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [10]:
class FeedforwardModel(nn.Module):
    def __init__(
        self,
        num_continuous,
        num_categories_list,
        embedding_dims,
        output_size,
        dropout=0.5,
    ):
        super().__init__()
        # Embedding layers for categorical variables
        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(num_categories, emb_dim)
                for num_categories, emb_dim in zip(num_categories_list, embedding_dims)
            ]
        )
        total_embedding_dim = sum(embedding_dims)
        # Fully connected layers
        self.fc1 = nn.Linear(num_continuous + total_embedding_dim, 64)
        self.fc2 = nn.Linear(64, output_size)
        # Activation
        self.act = nn.ReLU()
        # Dropout
        self.dropout = nn.Dropout(dropout)
        # Probabilistic outputting with sigmoid
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_cont, x_cat):
        # Pass categorical data through embeddings
        x_embeds = [
            emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)
        ]
        x_cat_embed = torch.cat(x_embeds, dim=1)
        # Concatenate continuous and embedded categorical data
        x = torch.cat([x_cont, x_cat_embed], dim=1)
        # Forward pass through fully connected layers
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [11]:
def test_model(model, dataloader, device, criterion):
    """Evaluate model on test data and print metrics"""
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []
    # Disable gradient calculations
    with torch.no_grad():
        loss_count = 0
        loss_total = 0
        # Loop through the dataset
        for x_cont, x_cat, y in dataloader:
            # Put data on device
            x_cont = x_cont.to(device)
            x_cat = x_cat.to(device)
            y = y.to(device)

            # Forward pass
            out = model(x_cont, x_cat)
            loss = criterion(out, y)

            # Convert predictions to their most likely class
            preds = torch.argmax(out, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(y.cpu().numpy())
            loss_count += 1
            loss_total += loss.item()

    # Calculate metrics
    # How many are right
    accuracy = accuracy_score(all_labels, all_preds)
    # Loss from the loss function
    loss_avg = loss_total / loss_count

    return accuracy, loss_avg

In [12]:
def train_loop(
    epochs,
    model,
    criterion,
    optimizer,
    scheduler,
    train_dataloader,
    device,
    test_dataloader,
):
    """Run training loop and store test diagnostics"""
    # Define the tensorboard logging, with setup to remove data on step 0, clearing old runs
    train_writer = SummaryWriter(log_dir="./runs/train", purge_step=0)
    test_writer = SummaryWriter(log_dir="./runs/test", purge_step=0)
    # Put model on device
    model.to(device)
    # For number of epochs...
    for epoch in range(epochs):
        # Set model to train
        model.train()
        # Loop through training dataset
        for x_cont, x_cat, y in train_dataloader:
            # Put data on device
            x_cont = x_cont.to(device)
            x_cat = x_cat.to(device)
            y = y.to(device)
            # Clear gradient
            optimizer.zero_grad()
            # Forward pass
            out = model(x_cont, x_cat)
            # Calculate loss
            loss = criterion(out, y)
            # Pass loss backward
            loss.backward()
            # # Add gradient clipping
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            # Step the optimizer
            optimizer.step()
        # Log learning rate to see how that moves over time
        current_lr = scheduler.get_last_lr()[0]
        train_writer.add_scalar("Learning Rate", current_lr, epoch)
        # Training metrics and logging
        train_accuracy, train_loss_avg = test_model(
            model, train_dataloader, device, criterion
        )
        # Step scheduler with average loss
        scheduler.step(train_loss_avg)
        train_writer.add_scalar("Accuracy", train_accuracy, epoch)
        train_writer.add_scalar("Loss", train_loss_avg, epoch)
        # Testing metrics and logging
        test_accuracy, test_loss_avg = test_model(
            model, test_dataloader, device, criterion
        )
        test_writer.add_scalar("Accuracy", test_accuracy, epoch)
        test_writer.add_scalar("Loss", test_loss_avg, epoch)

    return model, test_accuracy

In [13]:
# Determine embedding dimensions
num_categories_list = [num_categories[col] for col in categorical_columns]
embedding_dims = [min(50, 1 + (n + 1) // 10) for n in num_categories_list]

# Initialize the model
num_continuous = len(numerical_columns)
output_size = 3
model = FeedforwardModel(
    num_continuous, num_categories_list, embedding_dims, output_size, 0.75
)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define a scheduler for reducing learning rate over time
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.25)

model, final_test_accuracy = train_loop(
    1_000,
    model,
    criterion,
    optimizer,
    scheduler,
    train_dataloader,
    DEVICE,
    test_dataloader,
)

# Create a dummy classifier with a 'most_frequent' strategy
dummy_clf = DummyClassifier(strategy="most_frequent")

# Train the dummy
dummy_clf.fit(y_train_categorical, y_train_categorical)

# Evaluate the dummy
dummy_accuracy = dummy_clf.score(y_test_categorical, y_test_categorical)
print(f'Model accuracy: {final_test_accuracy} | Dummy accuracy: {dummy_accuracy}')

Model accuracy: 0.7796610169491526 | Dummy accuracy: 0.4937853107344633
