# Predicting Student Exam Scores Using a Deep Learning Model


In [None]:
import torch
import pandas as pd
import numpy as np
import kaggle as kg
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
#Make sure we are in the right directory
def read_data(test_or_train='train'):
    """Load one split of the playground-series-s6e1 dataset as a DataFrame.

    Args:
        test_or_train: Base name of the CSV file to load (e.g. 'train' or
            'test'); '.csv' is appended automatically.

    Returns:
        The DataFrame parsed from 'data/playground-series-s6e1/<name>.csv'.

    Side effects:
        When the current working directory has no 'data' entry, the process
        working directory is changed to its parent before reading.
    """
    # The notebook may be started from a sub-folder; step up one level so the
    # relative 'data/...' path resolves.
    if 'data' not in os.listdir():
        os.chdir('..')
    csv_path = f'data/playground-series-s6e1/{test_or_train}.csv'
    return pd.read_csv(csv_path)

# Name of the column holding the regression target; it is dropped from the
# features and read separately as the label.
TARGET_COL = 'exam_score'
# Name of the row-identifier column; it is dropped from the features and
# reused as the key column of the submission file.
ID_COL = 'id'

def create_and_fit_preprocessor(df_train, target_col=TARGET_COL, id_col=ID_COL):
    """Build a ColumnTransformer for the feature columns and fit it on training data.

    Args:
        df_train: Training DataFrame; must contain both ``id_col`` and
            ``target_col``.
        target_col: Name of the label column, excluded from fitting.
        id_col: Name of the identifier column, excluded from fitting.

    Returns:
        The fitted ColumnTransformer. 'int64'/'float64' columns are
        standardised and 'object' columns are one-hot encoded.
    """
    # Fit on real features only, so the id and the label are never scaled.
    features = df_train.drop(columns=[id_col, target_col])

    # Columns are routed by dtype; a column of any other dtype is listed in
    # neither step.
    numeric_features = list(features.select_dtypes(include=['int64', 'float64']).columns)
    categorical_features = list(features.select_dtypes(include=['object']).columns)

    scaling_step = ('num', StandardScaler(), numeric_features)
    # handle_unknown='ignore' lets categories that first appear in
    # validation/test data pass through transform() without raising.
    encoding_step = (
        'cat',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        categorical_features,
    )

    column_transformer = ColumnTransformer(
        transformers=[scaling_step, encoding_step],
        verbose_feature_names_out=False,  # no 'num__' / 'cat__' name prefixes
    )

    # fit() returns the transformer itself, so it can be returned directly.
    return column_transformer.fit(features)

def process_data(df, preprocessor, target_col=TARGET_COL, id_col=ID_COL):
    """Transform a DataFrame with an already-fitted preprocessor.

    Nothing is fitted here; the rules learned on the training data are only
    applied.

    Args:
        df: DataFrame to transform. The id and target columns are optional,
            so the same function serves data with or without a label column.
        preprocessor: Fitted transformer exposing ``transform`` and, where
            available, ``get_feature_names_out``.
        target_col: Name of the label column to drop if present.
        id_col: Name of the identifier column to drop if present.

    Returns:
        A new DataFrame of transformed features with a fresh default index.
        Columns are named after the preprocessor's output features when the
        preprocessor can report them, otherwise numbered positionally.
    """
    # Only drop the columns that are actually present, so only features are
    # transformed.
    drop_cols = [c for c in (id_col, target_col) if c in df.columns]
    X = df.drop(columns=drop_cols)

    # TRANSFORM only (do not fit!)
    X_processed = preprocessor.transform(X)

    # Feature names need scikit-learn v1.0+. Only the missing-method case is
    # tolerated; any other failure is a real error and is allowed to surface
    # instead of being silently swallowed.
    try:
        feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        feature_names = None  # fall back to positional column labels

    return pd.DataFrame(X_processed, columns=feature_names)

#Dataloader:
def create_dataloader(features, targets=None, batch_size=128, shuffle=True):
    """Wrap feature (and optionally target) arrays in a DataLoader.

    Args:
        features: Array-like of feature rows; converted to a float32 tensor.
        targets: Optional array-like of labels; converted to a float32 tensor.
            When omitted, each batch contains only the feature tensor.
        batch_size: Number of rows per batch.
        shuffle: Whether the loader reshuffles the rows on every pass.

    Returns:
        A DataLoader over a TensorDataset of ``(features,)`` or
        ``(features, targets)``.
    """
    tensors = [torch.tensor(features, dtype=torch.float32)]
    if targets is not None:
        tensors.append(torch.tensor(targets, dtype=torch.float32))
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle)

# Load the training and test CSV files.
train_df = read_data('train')
test_df = read_data('test')
# Hold out 20% of the training rows for validation; random_state makes the
# split reproducible. Note that train_df is rebound to the remaining 80%.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to all three sets.
preprocessor = create_and_fit_preprocessor(train_df)
train_processed = process_data(train_df, preprocessor)
val_processed = process_data(val_df, preprocessor)
test_processed = process_data(test_df, preprocessor)

# Labels as float32 arrays.
y_train = train_df[TARGET_COL].astype(np.float32).to_numpy()
y_val = val_df[TARGET_COL].astype(np.float32).to_numpy()

# Training batches use the default shuffle=True; validation and test loaders
# keep row order. The test loader carries features only (no targets passed).
train_dataloader = create_dataloader(train_processed.values, y_train)
val_dataloader = create_dataloader(val_processed.values, y_val, shuffle=False)
test_dataloader = create_dataloader(test_processed.values, shuffle=False)




## Build model:

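I decided to use a simple architecture for the model: an MLP with four hidden layers (64, 128, 64 and 32 units, each followed by ReLU), dropout, and a single-unit linear output layer.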


In [None]:
class NeuralNetworkModel(nn.Module):
    """Fully connected regression network with a single output value per row.

    Layer widths are input_size -> 64 -> 128 -> 64 -> 32 -> 1. ReLU follows
    each of the first four linear layers, and dropout (p=0.2) is applied once,
    just before the final linear layer.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: Number of features in each input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return the network output for a batch ``x`` whose last dimension is ``input_size``."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.relu(hidden_layer(x))

        # Dropout for generalization; only active while the module is in
        # training mode.
        return self.fc5(self.dropout(x))
    
    

### Define Function for training model using tqdm to keep track of progress

In [None]:
# Helpers for this cell: pick the compute device, then train the model with a
# tqdm progress bar, recording the training and validation loss of every epoch.
def get_device():
    """Return the torch device to run on: 'cuda' if available, else 'mps', else 'cpu'."""
    if torch.cuda.is_available():
        device_name = 'cuda'
    elif torch.backends.mps.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    return torch.device(device_name)


def train_model(model, dataloader_train, dataloader_val, criterion, optimizer, device, epochs=10):
    """Train ``model`` and record the average training and validation loss per epoch.

    Args:
        model: The torch module to optimise; it must already live on ``device``.
        dataloader_train: Loader yielding ``(inputs, targets)`` training batches.
        dataloader_val: Loader yielding ``(inputs, targets)`` validation batches.
        criterion: Loss function called as ``criterion(predictions, targets)``.
        optimizer: Optimizer bound to the parameters of ``model``.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of full passes over ``dataloader_train``.

    Returns:
        A dict with keys 'train_loss' and 'val_loss', each a list holding one
        per-sample average loss for every epoch.

    The model is left in training mode when the function returns.
    """
    history = {'train_loss': [], 'val_loss': []}
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for inputs, targets in tqdm(dataloader_train, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            # Give the predictions exactly the shape of the targets. A bare
            # squeeze() collapses a final batch of size 1 to a 0-d tensor, and
            # with (B, 1) targets it makes the loss broadcast to (B, B).
            outputs = model(inputs).reshape(targets.shape)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample average
            # (assumes criterion returns the batch mean, as nn.MSELoss does by
            # default).
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(dataloader_train.dataset)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
        history['train_loss'].append(epoch_loss)

        # ---- validation pass: eval mode disables dropout, no gradients ----
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in dataloader_val:
                inputs = inputs.to(device)
                targets = targets.to(device)
                outputs = model(inputs).reshape(targets.shape)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
        val_epoch_loss = val_loss / len(dataloader_val.dataset)
        print(f"Validation Loss: {val_epoch_loss:.4f}")
        history['val_loss'].append(val_epoch_loss)

    # Hand the model back in training mode, as callers have always received it.
    model.train()
    return history


In [None]:
# Pick the compute device and size the network's input layer from the number
# of preprocessed feature columns.
device = get_device()
input_size = train_processed.shape[1]
model = NeuralNetworkModel(input_size).to(device)
# Mean squared error loss, optimised with Adam at a learning rate of 1e-3.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Train for 20 epochs; train_hist holds the per-epoch train/validation losses.
train_hist = train_model(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs=20)

## Plot train and validation loss:

In [None]:
# Plot the per-epoch training and validation losses recorded in train_hist
# on one set of axes.
import matplotlib.pyplot as plt
plt.plot(train_hist['train_loss'], label='Train Loss')
plt.plot(train_hist['val_loss'], label='Validation Loss')   
# The x axis is the epoch index (0-based position in the history lists).
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:

def test_model(model, test_dataloader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for inputs in test_dataloader:
            inputs = inputs[0].to(device)
            predictions.append(model(inputs).squeeze().cpu().numpy())
    return predictions

# Predict on the test set; test_model returns one array per batch, so the
# pieces are joined into a single array.
test_predictions = test_model(model, test_dataloader, device)
test_predictions = np.concatenate(test_predictions)
# Create the submission file: the test ids next to the predicted scores.
# The test loader was built with shuffle=False, so predictions are in the
# same row order as test_df.
submission_df = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    TARGET_COL: test_predictions
})
submission_df.to_csv('nn_submission.csv', index=False)