In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "train_processed.csv"

# Load the latest version
train_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "jp234324/test-processed",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", train_df.head())

  train_df = kagglehub.load_dataset(


First 5 records:    Score                                               Text  \
0      5  I received this product early from the seller!...   
1      5  *****<br />Numi's Collection Assortment Melang...   
2      5  I was very careful not to overcook this pasta,...   
3      5  Buying this multi-pack I was misled by the pic...   
4      5  These bars are so good! I loved them warmed up...   

                                      Text_Processed  
0  receive product early seller tastey great midd...  
1  br numis collection assortment melange include...  
2  careful overcook pasta make sure take bite eve...  
3  buying multipack mislead picture whole hazel n...  
4  bar good love warm definitely think great snac...  


In [3]:
import os
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import time
from tqdm import tqdm

In [4]:
def create_model(bert_name, num_classes):
    bert = DistilBertModel.from_pretrained(bert_name)
    dropout = nn.Dropout(0.1)
    classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(input_ids, attention_mask):
        with torch.no_grad():
            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        x = dropout(bert_output)
        return classifier(x)

    # Wrap into a Module for training compatibility
    class ModelWrapper(nn.Module):
        def __init__(self):
            super().__init__()
            self.bert = bert
            self.dropout = dropout
            self.fc = classifier

        def forward(self, input_ids, attention_mask):
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = bert_output.last_hidden_state[:, 0] 
            x = self.dropout(pooled_output)  
            return self.fc(x)  

    return ModelWrapper()

In [22]:
# Collate function for lazy tokenization
def collate_batch(batch):
    texts, labels = zip(*batch)
    encodings = tokenizer(
        list(texts),
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_len
    )
    labels = torch.tensor([label - 1 for label in labels])  # Shift label if needed
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    }

In [18]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [7]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, targets = [], []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    return accuracy_score(targets, predictions), classification_report(targets, predictions)

In [8]:
def predict(text, model, tokenizer, device, max_length):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs, dim=1)
    return pred.item()

In [23]:
texts = train_df['Text_Processed'].tolist()
labels = train_df['Score'].tolist()

# Hyperparameters
bert_model_name = 'distilbert-base-uncased'
num_classes = 5
max_len = 128
batch_size = 128
epochs = 10
learning_rate = 2e-5

tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts = texts
train_labels = labels


train_data = list(zip(train_texts, train_labels))
val_data = list(zip(val_texts, val_labels))

# DataLoaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=batch_size, collate_fn=collate_batch)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = create_model(bert_model_name, num_classes).to(device)

# Optional: Use DataParallel if multiple GPUs are available
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    start = time.time()
    avg_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Average Training Loss: {avg_loss:.4f} | Time: {time.time() - start:.2f}s")

    acc, report = evaluate_model(model, val_loader, device)
    print(f"Validation Accuracy: {acc:.4f}")
    print(report)




Epoch 1/10


                                                             

Average Training Loss: 0.7763 | Time: 2133.14s
Validation Accuracy: 0.7474
              precision    recall  f1-score   support

           0       0.66      0.72      0.69      5644
           1       0.45      0.14      0.21      3214
           2       0.42      0.40      0.41      4679
           3       0.51      0.28      0.36      8688
           4       0.82      0.95      0.88     39602

    accuracy                           0.75     61827
   macro avg       0.57      0.50      0.51     61827
weighted avg       0.71      0.75      0.72     61827


Epoch 2/10


                                                             

Average Training Loss: 0.6802 | Time: 2129.33s
Validation Accuracy: 0.7684
              precision    recall  f1-score   support

           0       0.66      0.82      0.73      5644
           1       0.50      0.22      0.31      3214
           2       0.49      0.43      0.46      4679
           3       0.63      0.20      0.30      8688
           4       0.83      0.97      0.89     39602

    accuracy                           0.77     61827
   macro avg       0.62      0.53      0.54     61827
weighted avg       0.74      0.77      0.73     61827


Epoch 3/10


                                                             

Average Training Loss: 0.6259 | Time: 2128.28s
Validation Accuracy: 0.7952
              precision    recall  f1-score   support

           0       0.73      0.82      0.77      5644
           1       0.54      0.30      0.39      3214
           2       0.54      0.52      0.53      4679
           3       0.62      0.35      0.44      8688
           4       0.86      0.96      0.91     39602

    accuracy                           0.80     61827
   macro avg       0.66      0.59      0.61     61827
weighted avg       0.77      0.80      0.78     61827


Epoch 4/10


                                                             

Average Training Loss: 0.5757 | Time: 2126.64s
Validation Accuracy: 0.8125
              precision    recall  f1-score   support

           0       0.73      0.88      0.80      5644
           1       0.59      0.32      0.42      3214
           2       0.58      0.61      0.59      4679
           3       0.73      0.31      0.43      8688
           4       0.87      0.98      0.92     39602

    accuracy                           0.81     61827
   macro avg       0.70      0.62      0.63     61827
weighted avg       0.80      0.81      0.79     61827


Epoch 5/10


                                                             

Average Training Loss: 0.5292 | Time: 2125.68s
Validation Accuracy: 0.8409
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      5644
           1       0.64      0.48      0.55      3214
           2       0.62      0.71      0.66      4679
           3       0.71      0.46      0.56      8688
           4       0.90      0.96      0.93     39602

    accuracy                           0.84     61827
   macro avg       0.74      0.70      0.71     61827
weighted avg       0.83      0.84      0.83     61827


Epoch 6/10


                                                             

Average Training Loss: 0.4874 | Time: 2123.49s
Validation Accuracy: 0.8623
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      5644
           1       0.67      0.60      0.64      3214
           2       0.68      0.74      0.71      4679
           3       0.77      0.52      0.62      8688
           4       0.91      0.97      0.94     39602

    accuracy                           0.86     61827
   macro avg       0.78      0.74      0.75     61827
weighted avg       0.86      0.86      0.85     61827


Epoch 7/10


                                                             

Average Training Loss: 0.4488 | Time: 2122.63s
Validation Accuracy: 0.8773
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      5644
           1       0.73      0.64      0.68      3214
           2       0.72      0.79      0.76      4679
           3       0.86      0.50      0.63      8688
           4       0.91      0.99      0.95     39602

    accuracy                           0.88     61827
   macro avg       0.82      0.76      0.78     61827
weighted avg       0.87      0.88      0.87     61827


Epoch 8/10


                                                             

Average Training Loss: 0.4178 | Time: 2125.21s
Validation Accuracy: 0.8920
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      5644
           1       0.76      0.65      0.70      3214
           2       0.77      0.80      0.78      4679
           3       0.87      0.58      0.69      8688
           4       0.92      0.99      0.95     39602

    accuracy                           0.89     61827
   macro avg       0.84      0.79      0.80     61827
weighted avg       0.89      0.89      0.89     61827


Epoch 9/10


Training:  20%|█▉        | 476/2416 [06:58<28:31,  1.13it/s]