In [42]:
import os

import joblib
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [43]:
# Load the dataset: download it through kagglehub, then read the CSV inside
# the returned directory.
path = kagglehub.dataset_download("jayaantanaath/student-habits-vs-academic-performance")
df = pd.read_csv(os.path.join(path, 'student_habits_performance.csv'))

# Separate features and target. `student_id` is dropped so it is not used as a
# feature; `exam_score` is the regression target (kept as a NumPy array).
df = df.drop(columns=['student_id'])
X = df.drop(columns=['exam_score'])
y = df['exam_score'].values

# Identify column types: object columns are treated as categorical, 64-bit
# integer/float columns as numeric.
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Hold out 20% as the test split; the fixed random_state keeps the split stable
# across runs.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   gender                         1000 non-null   object 
 2   study_hours_per_day            1000 non-null   float64
 3   social_media_hours             1000 non-null   float64
 4   netflix_hours                  1000 non-null   float64
 5   part_time_job                  1000 non-null   object 
 6   attendance_percentage          1000 non-null   float64
 7   sleep_hours                    1000 non-null   float64
 8   diet_quality                   1000 non-null   object 
 9   exercise_frequency             1000 non-null   int64  
 10  parental_education_level       909 non-null    object 
 11  internet_quality               1000 non-null   object 
 12  mental_health_rating           1000 non-null   in

In [44]:
# Preprocessing: numeric columns are standardised, categorical columns are
# one-hot encoded into a dense array. `handle_unknown='ignore'` lets transform()
# accept categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
    ]
)

# Fit on the train/validation split only; the test split is transformed with
# the already-fitted preprocessor.
X_train_val_processed = preprocessor.fit_transform(X_train_val)
X_test_processed = preprocessor.transform(X_test)

# Persist the fitted preprocessor next to the notebook.
joblib.dump(preprocessor, 'preprocessor.joblib')

# float32 tensors; targets are reshaped into single-column matrices so they
# line up with the networks' (n, 1) outputs.
X_train_val_tensor = torch.tensor(X_train_val_processed, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)
y_train_val_tensor = torch.tensor(y_train_val.reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)

print("Processed X_train_val shape:", X_train_val_tensor.shape)
print("Processed X_test shape:", X_test_tensor.shape)

# Show the distinct values of each categorical column (NaN included).
for col in [
    'gender',
    'part_time_job',
    'diet_quality',
    'parental_education_level',
    'internet_quality',
    'extracurricular_participation',
]:
    print(f"{col} unique values:", df[col].unique())

Processed X_train_val shape: torch.Size([800, 25])
Processed X_test shape: torch.Size([200, 25])
gender unique values: ['Female' 'Male' 'Other']
part_time_job unique values: ['No' 'Yes']
diet_quality unique values: ['Fair' 'Good' 'Poor']
parental_education_level unique values: ['Master' 'High School' 'Bachelor' nan]
internet_quality unique values: ['Average' 'Poor' 'Good']
extracurricular_participation unique values: ['Yes' 'No']


In [45]:
# Define models
class SimpleNet(nn.Module):
    """Smallest regressor: ``input_dim -> 32 -> 1`` with one ReLU in between."""

    def __init__(self, input_dim):
        """Build the layer stack for feature vectors of width ``input_dim``."""
        super().__init__()
        hidden_units = 32
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        # The attribute must stay named `model`: state_dict keys
        # ("model.0.weight", ...) are derived from it.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per input row, shape ``(batch, 1)``."""
        return self.model(x)

class DeepNet(nn.Module):
    """Two-hidden-layer regressor: ``input_dim -> 64 -> 32 -> 1`` with ReLU activations."""

    def __init__(self, input_dim):
        """Build the layer stack for feature vectors of width ``input_dim``."""
        super().__init__()
        first_width, second_width = 64, 32
        layers = [
            nn.Linear(input_dim, first_width),
            nn.ReLU(),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Linear(second_width, 1),
        ]
        # The attribute must stay named `model`: state_dict keys
        # ("model.0.weight", ...) are derived from it.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per input row, shape ``(batch, 1)``."""
        return self.model(x)

class DeeperNet(nn.Module):
    """Four-hidden-layer regressor: ``input_dim -> 128 -> 64 -> 32 -> 16 -> 1`` with ReLU activations."""

    def __init__(self, input_dim):
        """Build the layer stack for feature vectors of width ``input_dim``."""
        super().__init__()
        hidden_widths = [128, 64, 32, 16]

        # Alternate Linear/ReLU for each hidden width, then add the output layer.
        # Layers are created front to back, so their positions in the
        # Sequential (0, 2, 4, 6, 8 for the Linear layers) stay fixed.
        layers = []
        fan_in = input_dim
        for width in hidden_widths:
            layers.append(nn.Linear(fan_in, width))
            layers.append(nn.ReLU())
            fan_in = width
        layers.append(nn.Linear(fan_in, 1))

        # The attribute must stay named `model`: state_dict keys
        # ("model.0.weight", ...) are derived from it.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per input row, shape ``(batch, 1)``."""
        return self.model(x)

In [46]:
# Pass/fail agreement: percentage of samples whose true and predicted scores
# fall on the same side of the pass mark.
def classification_accuracy(y_true, y_pred, threshold=60):
    """Return the percentage of samples classified consistently as pass or fail.

    A score counts as a "pass" when it is ``>= threshold``. A sample is correct
    when the true score and the predicted score are both passes or both fails.

    Args:
        y_true: True scores (array-like; any shape, flattened internally).
        y_pred: Predicted scores (array-like with the same number of elements).
        threshold: Pass mark applied to both inputs.

    Returns:
        Agreement rate as a percentage in ``[0, 100]`` (``nan`` for empty input).

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    # Flatten both sides so an (n, 1) column compared with an (n,) vector is
    # matched element by element instead of broadcasting to an (n, n) grid.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    y_true_class = y_true >= threshold
    y_pred_class = y_pred >= threshold
    return np.mean(y_true_class == y_pred_class) * 100

def train_model(model_class, X_tensor, y_tensor, learning_rate, batch_size, epochs=150, k_folds=5):
    """Cross-validate a network class with K-fold splits and average the fold metrics.

    For every fold a fresh ``model_class(n_features)`` is trained from scratch
    with SGD (momentum 0.9) on MSE loss for ``epochs`` passes over that fold's
    training part, then scored once on the fold's held-out part.

    Args:
        model_class: Callable taking the feature count and returning an ``nn.Module``.
        X_tensor: Feature tensor; rows are samples, ``shape[1]`` is the feature count.
        y_tensor: Target tensor indexed along the same first dimension as ``X_tensor``.
        learning_rate: SGD learning rate.
        batch_size: Mini-batch size for the training loader.
        epochs: Number of passes over each fold's training part.
        k_folds: Number of folds (the split is shuffled with a fixed seed of 42).

    Returns:
        Tuple ``(mean_mse, mean_r2, mean_accuracy, model)``. The three metrics
        are averaged over the folds. ``model`` is the network trained on the
        LAST fold only; it is not refit on the full data.
    """
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    input_dim = X_tensor.shape[1]
    criterion = nn.MSELoss()  # stateless, so one instance serves every fold
    losses = []
    accuracies = []
    r2s = []

    for train_idx, val_idx in kf.split(X_tensor):
        X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
        y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]

        train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

        # Fresh weights and optimizer state per fold so folds stay independent.
        model = model_class(input_dim)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

        model.train()
        for _ in range(epochs):
            for xb, yb in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(xb), yb)
                loss.backward()
                optimizer.step()

        # Score the whole held-out part in a single forward pass.
        model.eval()
        with torch.no_grad():
            val_preds_np = model(X_val).numpy()
        y_val_np = y_val.numpy()

        losses.append(mean_squared_error(y_val_np, val_preds_np))
        r2s.append(r2_score(y_val_np, val_preds_np))
        accuracies.append(classification_accuracy(y_val_np, val_preds_np))

    return np.mean(losses), np.mean(r2s), np.mean(accuracies), model

In [47]:
# Test the best model on the test set
def test_model(model, X_test_tensor, y_test_tensor):
    """Score ``model`` on a held-out set.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        X_test_tensor: Feature tensor fed to the model in one forward pass.
        y_test_tensor: Matching target tensor.

    Returns:
        Tuple ``(mse, r2, accuracy)`` where accuracy is the pass/fail agreement
        percentage from ``classification_accuracy``.
    """
    model.eval()
    with torch.no_grad():
        predicted_scores = model(X_test_tensor).numpy().flatten()
        true_scores = y_test_tensor.numpy().flatten()

    # Metrics work on flat 1-D arrays.
    return (
        mean_squared_error(true_scores, predicted_scores),
        r2_score(true_scores, predicted_scores),
        classification_accuracy(true_scores, predicted_scores),
    )

In [48]:
# Hyperparameter tuning: exhaustive grid over fold count, learning rate,
# batch size and architecture.
learning_rates = [0.0001, 0.00005]
batch_sizes = [16, 32, 64]
architectures = [SimpleNet, DeepNet, DeeperNet]
k_folds_list = [3, 5]

# Best-so-far trackers, one set per selection criterion. Candidates are ranked
# on validation metrics only; test metrics are stored alongside for reporting.
best_mse, best_mse_params, best_mse_model, best_mse_test_results = float('inf'), None, None, None
best_r2, best_r2_params, best_r2_model, best_r2_test_results = -float('inf'), None, None, None
best_acc, best_acc_params, best_acc_model, best_acc_test_results = -float('inf'), None, None, None

for k_folds in k_folds_list:
    for lr in learning_rates:
        for bs in batch_sizes:
            for arch in architectures:
                print(f"Evaluating {arch.__name__} with lr={lr}, batch_size={bs}, k_folds={k_folds}")

                # K-fold cross-validation on the train/validation split
                avg_loss, avg_r2, avg_accuracy, trained_model = train_model(
                    arch, X_train_val_tensor, y_train_val_tensor, lr, bs, k_folds=k_folds
                )
                print(f"Validation MSE: {avg_loss:.4f}, Validation R2: {avg_r2:.4f}, Validation Accuracy: {avg_accuracy:.2f}%")

                # Score the returned model on the held-out test split
                test_mse, test_r2, test_accuracy = test_model(trained_model, X_test_tensor, y_test_tensor)
                print(f"Test MSE: {test_mse:.4f}, Test R2: {test_r2:.4f}, Test Accuracy: {test_accuracy:.2f}%")
                print()

                # Description of this candidate, shared by the three trackers
                # (each tracker stores its own copy of the dict).
                candidate_params = {'model': arch.__name__, 'lr': lr, 'batch_size': bs, 'k_folds': k_folds}
                candidate_test_results = (test_mse, test_r2, test_accuracy)

                # Lowest validation MSE
                if avg_loss < best_mse:
                    best_mse, best_mse_model = avg_loss, trained_model
                    best_mse_params = dict(candidate_params)
                    best_mse_test_results = candidate_test_results

                # Highest validation R2
                if avg_r2 > best_r2:
                    best_r2, best_r2_model = avg_r2, trained_model
                    best_r2_params = dict(candidate_params)
                    best_r2_test_results = candidate_test_results

                # Highest validation accuracy
                if avg_accuracy > best_acc:
                    best_acc, best_acc_model = avg_accuracy, trained_model
                    best_acc_params = dict(candidate_params)
                    best_acc_test_results = candidate_test_results

# Report the winner per criterion together with its test metrics; sections are
# separated by one blank line.
summary_rows = [
    ("Best configuration for lowest Validation MSE:", best_mse_params,
     f"Validation MSE: {best_mse:.4f}", best_mse_test_results),
    ("Best configuration for highest Validation R2:", best_r2_params,
     f"Validation R2: {best_r2:.4f}", best_r2_test_results),
    ("Best configuration for highest Validation Accuracy:", best_acc_params,
     f"Validation Accuracy: {best_acc:.2f}%", best_acc_test_results),
]
for row_index, (headline, chosen_params, validation_line, test_triplet) in enumerate(summary_rows):
    if row_index > 0:
        print()
    print(headline, chosen_params)
    print(validation_line)
    print(f"Test MSE: {test_triplet[0]:.4f}, Test R2: {test_triplet[1]:.4f}, Test Accuracy: {test_triplet[2]:.2f}%")

Evaluating SimpleNet with lr=0.0001, batch_size=16, k_folds=3
Validation MSE: 35.3004, Validation R2: 0.8783, Validation Accuracy: 89.25%
Test MSE: 27.8385, Test R2: 0.8914, Test Accuracy: 93.50%

Evaluating DeepNet with lr=0.0001, batch_size=16, k_folds=3
Validation MSE: 49.6682, Validation R2: 0.8286, Validation Accuracy: 89.25%
Test MSE: 40.4743, Test R2: 0.8422, Test Accuracy: 90.50%

Evaluating DeeperNet with lr=0.0001, batch_size=16, k_folds=3
Validation MSE: 50.0153, Validation R2: 0.8279, Validation Accuracy: 88.75%
Test MSE: 45.3082, Test R2: 0.8233, Test Accuracy: 91.00%

Evaluating SimpleNet with lr=0.0001, batch_size=32, k_folds=3
Validation MSE: 31.9586, Validation R2: 0.8898, Validation Accuracy: 91.62%
Test MSE: 27.0384, Test R2: 0.8946, Test Accuracy: 94.00%

Evaluating DeepNet with lr=0.0001, batch_size=32, k_folds=3
Validation MSE: 40.9318, Validation R2: 0.8594, Validation Accuracy: 90.62%
Test MSE: 32.3534, Test R2: 0.8738, Test Accuracy: 93.50%

Evaluating DeeperNe

In [52]:
# Higher learning rates don't work with Deeper networks, so this second grid
# explores them on SimpleNet only.
learning_rates = [0.001, 0.0005, 0.0001]
batch_sizes = [16, 32, 64]
architectures = [SimpleNet]
k_folds_list = [3, 5]

# Best-so-far trackers, one set per selection criterion. Candidates are ranked
# on validation metrics only; test metrics are stored alongside for reporting.
best_mse, best_mse_params, best_mse_model, best_mse_test_results = float('inf'), None, None, None
best_r2, best_r2_params, best_r2_model, best_r2_test_results = -float('inf'), None, None, None
best_acc, best_acc_params, best_acc_model, best_acc_test_results = -float('inf'), None, None, None

for k_folds in k_folds_list:
    for lr in learning_rates:
        for bs in batch_sizes:
            for arch in architectures:
                print(f"Evaluating {arch.__name__} with lr={lr}, batch_size={bs}, k_folds={k_folds}")

                # K-fold cross-validation on the train/validation split
                avg_loss, avg_r2, avg_accuracy, trained_model = train_model(
                    arch, X_train_val_tensor, y_train_val_tensor, lr, bs, k_folds=k_folds
                )
                print(f"Validation MSE: {avg_loss:.4f}, Validation R2: {avg_r2:.4f}, Validation Accuracy: {avg_accuracy:.2f}%")

                # Score the returned model on the held-out test split
                test_mse, test_r2, test_accuracy = test_model(trained_model, X_test_tensor, y_test_tensor)
                print(f"Test MSE: {test_mse:.4f}, Test R2: {test_r2:.4f}, Test Accuracy: {test_accuracy:.2f}%")
                print()

                # Description of this candidate, shared by the three trackers
                # (each tracker stores its own copy of the dict).
                candidate_params = {'model': arch.__name__, 'lr': lr, 'batch_size': bs, 'k_folds': k_folds}
                candidate_test_results = (test_mse, test_r2, test_accuracy)

                # Lowest validation MSE
                if avg_loss < best_mse:
                    best_mse, best_mse_model = avg_loss, trained_model
                    best_mse_params = dict(candidate_params)
                    best_mse_test_results = candidate_test_results

                # Highest validation R2
                if avg_r2 > best_r2:
                    best_r2, best_r2_model = avg_r2, trained_model
                    best_r2_params = dict(candidate_params)
                    best_r2_test_results = candidate_test_results

                # Highest validation accuracy
                if avg_accuracy > best_acc:
                    best_acc, best_acc_model = avg_accuracy, trained_model
                    best_acc_params = dict(candidate_params)
                    best_acc_test_results = candidate_test_results

# Report the winner per criterion together with its test metrics; sections are
# separated by one blank line.
summary_rows = [
    ("Best configuration for lowest Validation MSE:", best_mse_params,
     f"Validation MSE: {best_mse:.4f}", best_mse_test_results),
    ("Best configuration for highest Validation R2:", best_r2_params,
     f"Validation R2: {best_r2:.4f}", best_r2_test_results),
    ("Best configuration for highest Validation Accuracy:", best_acc_params,
     f"Validation Accuracy: {best_acc:.2f}%", best_acc_test_results),
]
for row_index, (headline, chosen_params, validation_line, test_triplet) in enumerate(summary_rows):
    if row_index > 0:
        print()
    print(headline, chosen_params)
    print(validation_line)
    print(f"Test MSE: {test_triplet[0]:.4f}, Test R2: {test_triplet[1]:.4f}, Test Accuracy: {test_triplet[2]:.2f}%")

Evaluating SimpleNet with lr=0.001, batch_size=16, k_folds=3
Validation MSE: 56.4267, Validation R2: 0.8060, Validation Accuracy: 89.87%
Test MSE: 48.4196, Test R2: 0.8112, Test Accuracy: 89.00%

Evaluating SimpleNet with lr=0.001, batch_size=32, k_folds=3
Validation MSE: 46.1418, Validation R2: 0.8414, Validation Accuracy: 89.87%
Test MSE: 40.4371, Test R2: 0.8423, Test Accuracy: 91.50%

Evaluating SimpleNet with lr=0.001, batch_size=64, k_folds=3
Validation MSE: 41.3715, Validation R2: 0.8577, Validation Accuracy: 90.37%
Test MSE: 33.7801, Test R2: 0.8683, Test Accuracy: 93.50%

Evaluating SimpleNet with lr=0.0005, batch_size=16, k_folds=3
Validation MSE: 48.5016, Validation R2: 0.8332, Validation Accuracy: 90.00%
Test MSE: 42.0494, Test R2: 0.8360, Test Accuracy: 91.00%

Evaluating SimpleNet with lr=0.0005, batch_size=32, k_folds=3
Validation MSE: 40.8340, Validation R2: 0.8596, Validation Accuracy: 90.87%
Test MSE: 30.3217, Test R2: 0.8818, Test Accuracy: 94.00%

Evaluating SimpleN

In [53]:
# Retrain SimpleNet with the chosen hyperparameters. The values are hard-coded
# here (not read from the best_*_params variables) and defined once, so the
# printed message and the actual training call cannot drift apart.
# NOTE(review): this cell previously announced batch_size=64 while passing 32
# to train_model. 64 is used below to match the announced configuration;
# confirm it against the grid-search output.
RETRAIN_LR = 5e-05
RETRAIN_BATCH_SIZE = 64
RETRAIN_K_FOLDS = 5

print(f"Retraining with: lr={RETRAIN_LR}, batch_size={RETRAIN_BATCH_SIZE}, k_folds={RETRAIN_K_FOLDS}")

# train_model runs K-fold cross-validation and returns the network trained on
# its last fold; the averaged validation metrics are not needed here.
_, _, _, simplenet_model = train_model(
    SimpleNet,
    X_train_val_tensor,
    y_train_val_tensor,
    RETRAIN_LR,
    RETRAIN_BATCH_SIZE,
    k_folds=RETRAIN_K_FOLDS
)

# Save only the weights (state_dict); loading them later requires
# instantiating SimpleNet with the same input width first.
torch.save(simplenet_model.state_dict(), 'simplenet_model.pth')

Retraining with: lr=5e-05, batch_size=64, k_folds=5
