In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load the dataset
df = pd.read_csv('Updated_StudentPerformanceFactors_Cleaned.csv')

# Column roles used throughout the notebook
text_column = 'Teacher_Notes'
categorical_columns = ['Motivation_Level', 'Parental_Involvement']
target_column = 'Exam_Score'

# Preprocessing for categorical data.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
    # Always return a dense ndarray. With the default threshold the result
    # becomes a SciPy sparse matrix once the one-hot output is sparse enough,
    # and the torch.tensor(...) calls made on this output cannot consume that.
    sparse_threshold=0,
)

# BERT tokenizer matching the 'bert-base-uncased' encoder used by the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text, max_length=128):
    """Tokenize ``text`` with the module-level BERT ``tokenizer``.

    Args:
        text: Passed to the tokenizer unchanged. Hugging Face tokenizers
            accept a single string or a list of strings (treated as a batch).
        max_length: Sequence length that every example is padded or truncated
            to. Defaults to 128, the value used for training.

    Returns:
        The tokenizer's output as PyTorch tensors (``return_tensors='pt'``),
        including ``'input_ids'`` and ``'attention_mask'``.
    """
    return tokenizer(
        text,
        padding='max_length',   # pad short inputs up to max_length
        truncation=True,        # cut long inputs down to max_length
        max_length=max_length,
        return_tensors='pt'
    )

# Store the tokenizer output for every row on the frame.
# NOTE(review): nothing in the visible cells reads `tokenized_text` (the model
# inputs are built again from the raw text by prepare_text_input) — confirm
# whether this column is still needed.
df['tokenized_text'] = df[text_column].apply(tokenize_text)

# Features: raw text plus the categorical columns. Target: the exam score.
X = df[[text_column, *categorical_columns]]
y = df[target_column]

# Hold out 30% of the rows, then halve the hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the encoder on the training split only and convert the result to a float tensor.
X_train_categorical = torch.tensor(
    preprocessor.fit_transform(X_train[categorical_columns]),
    dtype=torch.float,
)

# Reuse the fitted encoder for the validation and test splits.
X_val_categorical, X_test_categorical = (
    torch.tensor(preprocessor.transform(split[categorical_columns]), dtype=torch.float)
    for split in (X_val, X_test)
)

# Prepare tokenized text for BERT input
def prepare_text_input(df):
    """Tokenize the text column of ``df`` into batched BERT input tensors.

    The whole column is tokenized in one call instead of row by row. Because
    ``tokenize_text`` pads and truncates every example to the same length, the
    batched result is already a pair of 2-D tensors and needs no stacking.

    Args:
        df: DataFrame containing the ``text_column`` column.

    Returns:
        Tuple ``(input_ids, attention_masks)`` with one row per row of ``df``,
        in the same order.
    """
    texts = df[text_column].tolist()
    tokens = tokenize_text(texts)
    return tokens['input_ids'], tokens['attention_mask']

# Build (input_ids, attention_mask) tensors for the train, validation and test splits, in that order.
(X_train_text, X_train_masks), (X_val_text, X_val_masks), (X_test_text, X_test_masks) = (
    prepare_text_input(split) for split in (X_train, X_val, X_test)
)

# Define the Hybrid Neural Network Model
class StudentPerformanceModel(nn.Module):
    """Regressor that fuses a BERT text encoding with categorical features.

    The text branch is a pretrained BERT encoder whose pooled output is used
    as the text representation. The categorical branch is a single linear
    layer with ReLU. Both are concatenated, passed through a hidden layer with
    ReLU and dropout, and projected to ``output_dim`` values.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        categorical_input_dim: Width of the encoded categorical feature vector.
        output_dim: Number of values predicted per example.
    """

    def __init__(self, bert_model_name='bert-base-uncased', categorical_input_dim=3, output_dim=1):
        super().__init__()

        # Text branch: the encoder's hidden width sizes the fusion layer below.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_hidden_size = self.bert.config.hidden_size

        # Categorical branch projects the encoded features to 32 units.
        categorical_width = 32
        self.fc_categorical = nn.Linear(categorical_input_dim, categorical_width)

        # Fusion head: concatenated features -> 64 hidden units -> prediction.
        fused_width = 64
        self.fc_combined = nn.Linear(self.bert_hidden_size + categorical_width, fused_width)
        self.output_layer = nn.Linear(fused_width, output_dim)

        # Shared non-linearity and regularisation.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, categorical_data):
        """Return predictions for a batch of tokenized text and categorical features."""
        # `pooler_output` is BERT's pooled, sequence-level representation.
        text_features = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output

        categorical_features = self.relu(self.fc_categorical(categorical_data))

        # Concatenate both branches along the feature dimension.
        fused = torch.cat((text_features, categorical_features), dim=1)
        hidden = self.dropout(self.relu(self.fc_combined(fused)))

        return self.output_layer(hidden)

# Seed PyTorch so the new layers' initial weights, the shuffling order of the
# training loader and dropout are repeatable across runs (42 matches the seed
# used for the train/validation/test split).
torch.manual_seed(42)

# Pick the device first and move the model onto it before building the
# optimizer, so the optimizer is created over parameters that already live on
# their final device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StudentPerformanceModel(categorical_input_dim=X_train_categorical.size(1))
model.to(device)

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Each dataset yields (input_ids, attention_mask, categorical_features, target).
train_dataset = TensorDataset(
    X_train_text,
    X_train_masks,
    X_train_categorical,
    torch.tensor(y_train.values, dtype=torch.float)
)
val_dataset = TensorDataset(
    X_val_text,
    X_val_masks,
    X_val_categorical,
    torch.tensor(y_val.values, dtype=torch.float)
)
test_dataset = TensorDataset(
    X_test_text,
    X_test_masks,
    X_test_categorical,
    torch.tensor(y_test.values, dtype=torch.float)
)

# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training Loop
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=5):
    """Train ``model`` and report the average batch loss after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask, categorical_data)``.
        criterion: Loss function applied to ``(predictions, targets)``.
        optimizer: Optimizer constructed over ``model``'s parameters.
        train_loader: Iterable of batches whose first four items are
            ``(input_ids, attention_mask, categorical_data, targets)``.
        val_loader: Batches in the same layout, evaluated without gradients.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. The mean per-batch training and validation losses are printed
        once per epoch.
    """
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device

    def _batch_loss(batch):
        """Move one batch to the model's device and compute its loss."""
        input_ids, attention_mask, categorical_data, targets = (
            tensor.to(device) for tensor in batch[:4]
        )
        outputs = model(input_ids, attention_mask, categorical_data)
        # Drop only the trailing output dimension: a bare squeeze() would also
        # collapse a batch of one to a 0-d tensor that no longer matches
        # `targets` and triggers a broadcasting warning in the loss.
        return criterion(outputs.squeeze(-1), targets)

    for epoch in range(epochs):
        # Training pass
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            loss = _batch_loss(batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / len(train_loader):.4f}")

        # Validation pass: eval mode disables dropout, no_grad skips autograd.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _batch_loss(batch).item()

        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss / len(val_loader):.4f}")

# Fine-tune the hybrid model for ten epochs, printing the loss after each one
train_model(
    model,
    criterion,
    optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/10, Training Loss: 3229.9707
Epoch 1/10, Validation Loss: 1878.4407
Epoch 2/10, Training Loss: 952.0929
Epoch 2/10, Validation Loss: 265.6756
Epoch 3/10, Training Loss: 150.3779
Epoch 3/10, Validation Loss: 23.2517
Epoch 4/10, Training Loss: 87.4065
Epoch 4/10, Validation Loss: 16.0049
Epoch 5/10, Training Loss: 85.1092
Epoch 5/10, Validation Loss: 15.9359
Epoch 6/10, Training Loss: 89.0840
Epoch 6/10, Validation Loss: 15.6086
Epoch 7/10, Training Loss: 84.5572
Epoch 7/10, Validation Loss: 14.8467
Epoch 8/10, Training Loss: 94.9182
Epoch 8/10, Validation Loss: 16.0709
Epoch 9/10, Training Loss: 85.1734
Epoch 9/10, Validation Loss: 15.5845
Epoch 10/10, Training Loss: 82.3322
Epoch 10/10, Validation Loss: 15.6745


In [6]:
import pandas as pd

def test_model(model, text_input, categorical_input):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the text input
    tokenized_input = tokenizer(
        text_input,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    input_ids = tokenized_input['input_ids'].to(device)
    attention_mask = tokenized_input['attention_mask'].to(device)

    # Encode categorical input
    categorical_df = pd.DataFrame([categorical_input], columns=categorical_columns)
    categorical_input_encoded = preprocessor.transform(categorical_df)
    categorical_tensor = torch.tensor(categorical_input_encoded, dtype=torch.float).to(device)

    # Forward pass through the model
    with torch.no_grad():
        output = model(input_ids, attention_mask, categorical_tensor)

    # Output the prediction
    return output.squeeze().item()

# Example: one free-text note plus one value per entry of categorical_columns
text_input = "Has a balanced study schedule. Maintains healthy sleep habits. Requires support to boost motivation."
categorical_input = ['Low', 'Low']  # motivation level, parental involvement

# Run the trained model on the example and report its prediction
predicted_score = test_model(
    model,
    text_input=text_input,
    categorical_input=categorical_input,
)
print(f"Predicted Exam Score: {predicted_score}")


Predicted Exam Score: 66.10328674316406
