In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer
import torch

# Load the dataset
df = pd.read_csv('Updated_StudentPerformanceFactors_Cleaned.csv')
df.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score,Teacher_Notes
0,23,84,Low,High,No,7,73,Low,Yes,0,...,Medium,Public,Positive,3,No,High School,Near,Male,67,Has a balanced study schedule. Maintains healt...
1,19,64,Low,Medium,No,8,59,Low,Yes,2,...,Medium,Public,Negative,4,No,College,Moderate,Female,61,Has a balanced study schedule. Maintains healt...
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,...,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74,Has a balanced study schedule. Maintains healt...
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,...,Medium,Public,Negative,4,No,High School,Moderate,Male,71,Puts in excellent study efforts. Maintains hea...
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,...,High,Public,Neutral,4,No,College,Near,Female,70,Has a balanced study schedule. Maintains healt...


In [2]:
text_column = 'Teacher_Notes'
categorical_columns = ['Motivation_Level', 'Parental_Involvement']
target_column = 'Exam_Score'

# Step 1: Preprocess Text Data
# Initialize BERT tokenizer


In [3]:
# Step 2: Preprocess Categorical Data
# One-hot encode categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')



In [4]:
# Step 3: Combine Preprocessing
# Define preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough'
)



In [5]:
# Step 4: Split the Data
#X = df[[text_column] + categorical_columns]
#y = df[target_column]

## Split into training, validation, and test sets
#X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
#X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Step 1: Preprocess Text Data
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    tokens = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    return tokens

# Apply tokenization to the dataset
df['tokenized_text'] = df[text_column].apply(lambda x: tokenize_text(x))

# Step 2: Split the Data





In [6]:

X = df[[text_column, 'tokenized_text'] + categorical_columns]
y = df[target_column]

# Split into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Prepare tokenized text for BERT input
def prepare_text_input(df):
    input_ids = []
    attention_masks = []
    for i in df['tokenized_text']:
        input_ids.append(i['input_ids'].squeeze(0))
        attention_masks.append(i['attention_mask'].squeeze(0))
    return torch.stack(input_ids), torch.stack(attention_masks)



In [7]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score,Teacher_Notes,tokenized_text
0,23,84,Low,High,No,7,73,Low,Yes,0,...,Public,Positive,3,No,High School,Near,Male,67,Has a balanced study schedule. Maintains healt...,"[input_ids, token_type_ids, attention_mask]"
1,19,64,Low,Medium,No,8,59,Low,Yes,2,...,Public,Negative,4,No,College,Moderate,Female,61,Has a balanced study schedule. Maintains healt...,"[input_ids, token_type_ids, attention_mask]"
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,...,Public,Neutral,4,No,Postgraduate,Near,Male,74,Has a balanced study schedule. Maintains healt...,"[input_ids, token_type_ids, attention_mask]"
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,...,Public,Negative,4,No,High School,Moderate,Male,71,Puts in excellent study efforts. Maintains hea...,"[input_ids, token_type_ids, attention_mask]"
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,...,Public,Neutral,4,No,College,Near,Female,70,Has a balanced study schedule. Maintains healt...,"[input_ids, token_type_ids, attention_mask]"


In [8]:
X_train_text, X_train_masks = prepare_text_input(X_train)
X_val_text, X_val_masks = prepare_text_input(X_val)
X_test_text, X_test_masks = prepare_text_input(X_test)

In [9]:
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel

# Define the Hybrid Neural Network Model
class StudentPerformanceModel(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', categorical_input_dim=3, output_dim=1):
        super(StudentPerformanceModel, self).__init__()
        # BERT for text processing
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_hidden_size = self.bert.config.hidden_size
        
        # Fully connected layer for categorical data
        self.fc_categorical = nn.Linear(categorical_input_dim, 32)
        
        # Combined output layer
        self.fc_combined = nn.Linear(self.bert_hidden_size + 32, 64)
        self.output_layer = nn.Linear(64, output_dim)
        
        # Activation functions
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, categorical_data):
        # Process text data with BERT
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_pooled_output = bert_outputs.pooler_output  # [CLS] token representation
        
        # Process categorical data
        categorical_output = self.relu(self.fc_categorical(categorical_data))
        
        # Combine outputs
        combined_output = torch.cat((bert_pooled_output, categorical_output), dim=1)
        combined_output = self.relu(self.fc_combined(combined_output))
        combined_output = self.dropout(combined_output)
        
        # Final output
        output = self.output_layer(combined_output)
        return output

# Initialize Model, Loss Function, and Optimizer
model = StudentPerformanceModel(categorical_input_dim=len(categorical_columns))
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training Loop
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=5):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            categorical_data = batch['categorical_data'].to(device)
            targets = batch['targets'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, categorical_data)
            loss = criterion(outputs.squeeze(), targets)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / len(train_loader):.4f}")
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                categorical_data = batch['categorical_data'].to(device)
                targets = batch['targets'].to(device)
                
                outputs = model(input_ids, attention_mask, categorical_data)
                loss = criterion(outputs.squeeze(), targets)
                val_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss / len(val_loader):.4f}")

# Example: Call train_model (Make sure to create DataLoaders for train_loader and val_loader)
# train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10)


KeyboardInterrupt: 