In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pathlib as Path


In [2]:
# Seed PyTorch's global RNG so weight initialisation and shuffling repeat
# from run to run.
torch.manual_seed(40028922)

# Use a CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [3]:
# Read the training table from the working directory and preview one row.
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_train.head(n=1)

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3


In [4]:
# Collect every numeric column and store it as 32-bit floats.
num_cols = df_train.select_dtypes(include=["number"]).columns
df_train[num_cols] = df_train[num_cols].astype(np.float32)

In [5]:
# Columns still stored as Python objects are treated as the categorical
# variables to encode.
cat_vars = df_train.select_dtypes(include=["object"]).columns
cat_vars

Index(['gender', 'course', 'internet_access', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty'],
      dtype='object')

In [6]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each variable so the indicator columns are not perfectly collinear.
df_train = pd.get_dummies(
    data=df_train,
    columns=cat_vars,
    drop_first=True,
)

In [7]:
# Cast the whole frame (including the freshly created indicator columns) to
# 32-bit floats.
df_train = df_train.astype(np.float32)

In [8]:
# Remove the `id` column from the frame before building features.
df_train = df_train.drop(columns=["id"])

In [9]:
# Print a summary of the encoded frame: column names, non-null counts, dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   age                         630000 non-null  float32
 1   study_hours                 630000 non-null  float32
 2   class_attendance            630000 non-null  float32
 3   sleep_hours                 630000 non-null  float32
 4   exam_score                  630000 non-null  float32
 5   gender_male                 630000 non-null  float32
 6   gender_other                630000 non-null  float32
 7   course_b.sc                 630000 non-null  float32
 8   course_b.tech               630000 non-null  float32
 9   course_ba                   630000 non-null  float32
 10  course_bba                  630000 non-null  float32
 11  course_bca                  630000 non-null  float32
 12  course_diploma              630000 non-null  float32
 13  internet_acces

In [10]:
# Target: the exam score column. Features: every remaining column.
y = df_train["exam_score"]
X = df_train.drop(columns="exam_score")

In [11]:
# Hold out 30% of the rows for evaluation, using a fixed seed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40028922,
)

In [12]:
# Wrap the feature matrices as tensors.
X_train_t = torch.from_numpy(X_train.to_numpy())
X_test_t = torch.from_numpy(X_test.to_numpy())

# Targets get a trailing singleton axis, giving them shape (n_samples, 1).
y_train_t = torch.from_numpy(y_train.to_numpy()).unsqueeze(1)
y_test_t = torch.from_numpy(y_test.to_numpy()).unsqueeze(1)

In [13]:
# Move all four tensors onto the selected device in one pass.
X_train_t, X_test_t, y_train_t, y_test_t = (
    tensor.to(device)
    for tensor in (X_train_t, X_test_t, y_train_t, y_test_t)
)

In [32]:
class TabularDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its target entry.

    The two containers are stored exactly as given; nothing is copied or
    converted.

    Args:
        X: Indexable, sized container of features, one entry per sample.
        y: Indexable, sized container of targets, aligned with ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of
            entries.
    """

    def __init__(self, X, y):
        # Fail fast: the dataset length is taken from X alone, so a shorter y
        # would only show up as an indexing error mid-epoch and a longer y
        # would be silently truncated.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples (the length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]



# Wrap the train and held-out tensors so they can be batched by a DataLoader.
train_dataset = TabularDataset(X=X_train_t, y=y_train_t)
test_dataset = TabularDataset(X=X_test_t, y=y_test_t)

from torch.utils.data import DataLoader

# Single batch size shared by both loaders so it can be tuned in one place.
BATCH_SIZE = 256

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation gains nothing from shuffling; a fixed order keeps the batches
# aligned with the rows of the held-out tensors and makes the evaluation pass
# deterministic.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

import torch.nn as nn

class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,1)
        )
    
    
    def forward(self, x):
        return self.net(x)


# The feature count is the second axis of the training matrix.
input_dim = X_train_t.size(1)

# Build the network, then place its parameters on the selected device.
model = MLPRegressor(input_dim)
model = model.to(device)

# Mean squared error averaged over the batch, optimised with Adam.
criterion = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


# Number of full passes over the training loader.
EPOCHS = 20

for epoch in range(EPOCHS):
    # Put the module in training mode at the start of every pass.
    model.train()
    # Running sum of the per-batch loss values for this epoch.
    epoch_loss = 0.0
    

    for X_batch, y_batch in train_loader:
        # Clear gradients left from the previous step before the new backward pass.
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python number, so the total does not hold on to tensors.
        epoch_loss += loss.item()

    # Divide by the number of batches (not samples): every batch counts equally,
    # including a final batch that may be smaller than the rest.
    epoch_loss /= len(train_loader)
    print(f'Epoch {epoch+1}/{EPOCHS} - Train MSE: {epoch_loss:.4f}')


Epoch 1/20 - Train MSE: 151.5763
Epoch 2/20 - Train MSE: 81.0352
Epoch 3/20 - Train MSE: 80.6905
Epoch 4/20 - Train MSE: 80.3277
Epoch 5/20 - Train MSE: 80.2422
Epoch 6/20 - Train MSE: 79.9178
Epoch 7/20 - Train MSE: 79.9538
Epoch 8/20 - Train MSE: 79.8732
Epoch 9/20 - Train MSE: 79.8945
Epoch 10/20 - Train MSE: 79.6853
Epoch 11/20 - Train MSE: 79.7394
Epoch 12/20 - Train MSE: 79.5988
Epoch 13/20 - Train MSE: 79.6200
Epoch 14/20 - Train MSE: 79.6371
Epoch 15/20 - Train MSE: 79.6726
Epoch 16/20 - Train MSE: 79.5094
Epoch 17/20 - Train MSE: 79.5424
Epoch 18/20 - Train MSE: 79.5401
Epoch 19/20 - Train MSE: 79.4367
Epoch 20/20 - Train MSE: 79.4470
