In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the Kaggle dataset that should be loaded.
file_path = "train_processed.csv"

# Load the latest version of that file through the pandas adapter.
# `dataset_load` replaces the deprecated `load_dataset` spelling, which made
# this cell emit a warning that echoed the call line into the output.
train_df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "buddycyph/train-processed",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Quick sanity check of the loaded frame.
print("First 5 records:", train_df.head())

  train_df = kagglehub.load_dataset(


First 5 records:    Score                                               Text  \
0      5  I received this product early from the seller!...   
1      5  *****<br />Numi's Collection Assortment Melang...   
2      5  I was very careful not to overcook this pasta,...   
3      5  Buying this multi-pack I was misled by the pic...   
4      5  These bars are so good! I loved them warmed up...   

                                      Text_Processed  
0  receive product early seller tastey great midd...  
1  br numis collection assortment melange include...  
2  careful overcook pasta make sure take bite eve...  
3  buying multipack mislead picture whole hazel n...  
4  bar good love warm definitely think great snac...  


In [1]:
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import time
from tqdm import tqdm

In [4]:
def create_model(bert_name, num_classes, dropout_rate=0.1):
    """Build a BERT encoder topped with a dropout + linear classification head.

    Args:
        bert_name: Identifier forwarded to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced by the linear head.
        dropout_rate: Dropout probability applied to the pooled encoder
            output before the linear head. Defaults to 0.1.

    Returns:
        An ``nn.Module`` with submodules ``bert``, ``dropout`` and ``fc`` whose
        ``forward(input_ids, attention_mask)`` returns the linear head's
        output for the encoder's ``pooler_output``.
    """

    class ModelWrapper(nn.Module):
        def __init__(self):
            super().__init__()
            # Attribute names are kept stable so state_dict keys do not change.
            self.bert = BertModel.from_pretrained(bert_name)
            self.dropout = nn.Dropout(dropout_rate)
            self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

        def forward(self, input_ids, attention_mask):
            # No torch.no_grad() here: gradients reach the encoder, so its
            # weights are updated whenever they are handed to the optimizer.
            pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
            return self.fc(self.dropout(pooled))

    return ModelWrapper()

In [11]:
# Collate function for lazy tokenization
def collate_batch(batch):
    """Tokenize a batch of ``(text, label)`` pairs at load time.

    Reads the notebook-level ``tokenizer`` and ``max_len`` globals, so both
    must be defined before a DataLoader using this function is iterated.

    Args:
        batch: Sequence of ``(text, label)`` tuples.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        tokenizer output and ``'labels'`` as a ``torch.long`` tensor holding
        each label minus one.
    """
    texts, labels = zip(*batch)
    encodings = tokenizer(
        # Coerce to str (as TextClassificationDataset does) so a non-string
        # cell such as NaN does not break tokenization.
        [str(text) for text in texts],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_len
    )
    # Shift labels down by one (presumably scores 1..5 -> class ids 0..4) and
    # pin the dtype that CrossEntropyLoss expects for class-index targets.
    targets = torch.tensor([label - 1 for label in labels], dtype=torch.long)
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': targets
    }

In [14]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Sized iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``, or ``0.0``
        when the dataloader is empty (instead of raising ZeroDivisionError).
    """
    model.train()

    num_batches = len(dataloader)
    if num_batches == 0:
        return 0.0

    # Build the loss module once rather than on every batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / num_batches


In [15]:
def evaluate_model(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        dataloader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, report)`` as returned by ``accuracy_score`` and
        ``classification_report`` for the collected targets and predictions.
    """
    model.eval()
    y_pred = []
    y_true = []
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in dataloader:
            ids, mask, gold = (batch[key].to(device) for key in batch_keys)
            logits = model(input_ids=ids, attention_mask=mask)
            # Highest-scoring column per row is the predicted class.
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(gold.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

In [8]:
def predict(text, model, tokenizer, device, max_length):
    """Return the index of the highest-scoring class for ``text``.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length handed to the tokenizer.

    Returns:
        The argmax over dim 1 of the model output as a Python number. This is
        the raw class index; no offset is added back to it here.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        best = logits.argmax(dim=1)
    return best.item()

In [12]:
# Pull the two columns used for training out of the loaded frame as plain lists.
texts = train_df['Text'].tolist()
labels = train_df['Score'].tolist()

# Hyperparameters
bert_model_name = 'bert-base-uncased'
num_classes = 5
max_len = 128  # read as a global by collate_batch
batch_size = 32
epochs = 5
learning_rate = 2e-5

# `tokenizer` is also read as a global by collate_batch, so it must exist
# before either DataLoader below is iterated.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# 80/20 split with a fixed seed so the validation set is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Prepare raw samples (lazy style): keep (text, label) pairs untokenized;
# tokenization happens per batch inside collate_batch.
train_data = list(zip(train_texts, train_labels))
val_data = list(zip(val_texts, val_labels))

# DataLoaders (only the training loader shuffles)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=batch_size, collate_fn=collate_batch)

In [16]:
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = create_model(bert_model_name, num_classes).to(device)

# Optional: Use DataParallel if multiple GPUs are available
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# NOTE(review): AdamW here is the class imported from `transformers` at the
# top of the notebook; newer transformers releases reportedly drop it in
# favour of torch.optim.AdamW - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train_epoch steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# One training pass followed by one validation pass per epoch.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    start = time.time()
    avg_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Average Training Loss: {avg_loss:.4f} | Time: {time.time() - start:.2f}s")

    acc, report = evaluate_model(model, val_loader, device)
    print(f"Validation Accuracy: {acc:.4f}")
    print(report)


Epoch 1/5


                                                             

KeyboardInterrupt: 

In [None]:
import os
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import time
from tqdm import tqdm

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, bert_model_name, num_classes):
        """Load the encoder named ``bert_model_name`` and size the head for ``num_classes``."""
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.fc(regularized)
  
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per ``__getitem__`` call."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the raw inputs; no tokenization happens here.

        Args:
            texts: Indexable collection of texts (each item is passed through ``str``).
            labels: Indexable collection of numeric labels aligned with ``texts``.
            tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_length: Padding/truncation length handed to the tokenizer.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its flattened tensors plus label.

        Returns:
            Dict with flattened ``'input_ids'`` and ``'attention_mask'`` and a
            ``torch.long`` ``'label'`` equal to the stored label minus one.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Pin the dtype: a float-typed score would otherwise yield a float
        # target, which CrossEntropyLoss does not accept as a class index.
        target = torch.tensor(self.labels[idx] - 1, dtype=torch.long)
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': target,
        }
  
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``, printing progress and timing.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        None. The loss of every 100th batch and the total epoch time are
        printed.
    """
    model.train()
    # Build the loss module once rather than on every batch.
    criterion = nn.CrossEntropyLoss()
    epoch_start_time = time.time()  # Start time for the epoch
    for i, batch in enumerate(tqdm(data_loader, desc="Training", leave=False)):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print batch progress every 100th batch
        if i % 100 == 0:
            print(f"Batch: {i}, Loss: {loss.item():.4f}")

    epoch_duration = time.time() - epoch_start_time
    print(f"Epoch time: {epoch_duration:.2f} seconds")

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, report)`` as returned by ``accuracy_score`` and
        ``classification_report`` for the collected labels and predictions.
    """
    model.eval()
    y_pred, y_true = [], []
    batch_keys = ('input_ids', 'attention_mask', 'label')

    with torch.no_grad():
        for batch in data_loader:
            ids, mask, gold = (batch[key].to(device) for key in batch_keys)
            logits = model(input_ids=ids, attention_mask=mask)
            # Keep only the indices of the row-wise maxima.
            winners = torch.max(logits, dim=1).indices
            y_pred.extend(winners.cpu().tolist())
            y_true.extend(gold.cpu().tolist())

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Return the index of the highest-scoring class for ``text``.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length handed to the tokenizer.

    Returns:
        The index of the maximum along dim 1 of the model output, as a Python
        number. No offset is added back to it here.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        winner = torch.max(logits, dim=1).indices
    return winner.item()
    
# Pull the two columns used for training out of the loaded frame as plain lists.
texts = train_df['Text'].tolist()
labels = train_df['Score'].tolist()

# Hyperparameters
bert_model_name = 'bert-base-uncased'
num_classes = 5
max_length = 128
batch_size = 128
num_epochs = 10
learning_rate = 2e-5

# 80/20 split with a fixed seed; only the validation half is kept as-is.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
# Train on the entire dataset while keeping the same validation split.
# NOTE(review): every validation row is therefore also a training row, so the
# validation metrics printed by the loop below do not measure performance on
# unseen data.
train_texts = texts
train_labels = labels

# Tokenization happens lazily, one sample at a time, inside the dataset.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model and move it to the device
model = BERTClassifier(bert_model_name, num_classes).to(device)

# Wrap the model with DataParallel to use both GPUs
# NOTE(review): unlike the earlier training cell, the wrap is unconditional
# here (no torch.cuda.device_count() check).
model = nn.DataParallel(model)
# To resume from a saved checkpoint instead of training from scratch, load it
# here (nothing in this cell writes such a file):
# model.load_state_dict(torch.load(f"/bert{epoch}e.pt"))
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train() steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# One training pass followed by one validation pass per epoch.
# NOTE(review): no checkpoint is written inside this loop, so an interrupted
# run loses all progress.
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Training phase
    # train() already prints its own "Epoch time", so the "Training time"
    # line below reports essentially the same duration a second time.
    train_start_time = time.time()
    train(model, train_dataloader, optimizer, scheduler, device)
    train_end_time = time.time()
    train_duration = train_end_time - train_start_time
    print(f"Training time: {train_duration:.2f} seconds")
    
    # Evaluation phase
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)


Epoch 1/10


Training:   0%|          | 1/2416 [00:01<1:07:09,  1.67s/it]

Batch: 0, Loss: 1.5496


Training:   4%|▍         | 101/2416 [02:50<1:05:06,  1.69s/it]

Batch: 100, Loss: 0.6323


Training:   8%|▊         | 201/2416 [05:40<1:02:46,  1.70s/it]

Batch: 200, Loss: 0.6157


Training:  12%|█▏        | 301/2416 [08:31<59:58,  1.70s/it]  

Batch: 300, Loss: 0.6132


Training:  17%|█▋        | 401/2416 [11:21<57:31,  1.71s/it]  

Batch: 400, Loss: 0.7654


Training:  21%|██        | 501/2416 [14:11<54:09,  1.70s/it]

Batch: 500, Loss: 0.7178


Training:  25%|██▍       | 601/2416 [17:01<51:59,  1.72s/it]

Batch: 600, Loss: 0.7990


Training:  29%|██▉       | 701/2416 [19:50<48:53,  1.71s/it]

Batch: 700, Loss: 0.7836


Training:  33%|███▎      | 801/2416 [22:40<45:29,  1.69s/it]

Batch: 800, Loss: 0.7393


Training:  37%|███▋      | 901/2416 [25:31<43:07,  1.71s/it]

Batch: 900, Loss: 0.6151


Training:  41%|████▏     | 1001/2416 [28:21<40:16,  1.71s/it]

Batch: 1000, Loss: 0.6327


Training:  46%|████▌     | 1101/2416 [31:11<38:12,  1.74s/it]

Batch: 1100, Loss: 0.7579


Training:  50%|████▉     | 1201/2416 [34:02<34:31,  1.71s/it]

Batch: 1200, Loss: 0.7761


Training:  54%|█████▍    | 1301/2416 [36:53<31:40,  1.70s/it]

Batch: 1300, Loss: 0.6520


Training:  58%|█████▊    | 1401/2416 [39:43<28:51,  1.71s/it]

Batch: 1400, Loss: 0.5906


Training:  62%|██████▏   | 1501/2416 [42:33<26:10,  1.72s/it]

Batch: 1500, Loss: 0.5777


Training:  66%|██████▋   | 1601/2416 [45:23<23:03,  1.70s/it]

Batch: 1600, Loss: 0.5016


Training:  70%|███████   | 1701/2416 [48:14<20:22,  1.71s/it]

Batch: 1700, Loss: 0.5183


Training:  75%|███████▍  | 1801/2416 [51:04<17:29,  1.71s/it]

Batch: 1800, Loss: 0.4993


Training:  79%|███████▊  | 1901/2416 [53:54<14:36,  1.70s/it]

Batch: 1900, Loss: 0.5136


Training:  83%|████████▎ | 2001/2416 [56:45<11:59,  1.73s/it]

Batch: 2000, Loss: 0.6898


Training:  87%|████████▋ | 2101/2416 [59:35<08:55,  1.70s/it]

Batch: 2100, Loss: 0.6723


Training:  91%|█████████ | 2201/2416 [1:02:25<06:05,  1.70s/it]

Batch: 2200, Loss: 0.6486


Training:  95%|█████████▌| 2301/2416 [1:05:15<03:14,  1.69s/it]

Batch: 2300, Loss: 0.5657


Training:  99%|█████████▉| 2401/2416 [1:08:05<00:25,  1.69s/it]

Batch: 2400, Loss: 0.5816


                                                               

Epoch time: 4109.83 seconds
Training time: 4109.83 seconds
Validation Accuracy: 0.7935
              precision    recall  f1-score   support

           0       0.69      0.84      0.76      5644
           1       0.52      0.30      0.38      3214
           2       0.55      0.49      0.52      4679
           3       0.58      0.39      0.47      8688
           4       0.87      0.95      0.91     39602

    accuracy                           0.79     61827
   macro avg       0.64      0.60      0.61     61827
weighted avg       0.77      0.79      0.78     61827


Epoch 2/10


Training:   0%|          | 1/2416 [00:01<1:09:08,  1.72s/it]

Batch: 0, Loss: 0.5725


Training:   4%|▍         | 101/2416 [02:52<1:05:40,  1.70s/it]

Batch: 100, Loss: 0.6504


Training:   8%|▊         | 201/2416 [05:42<1:02:32,  1.69s/it]

Batch: 200, Loss: 0.5743


Training:  12%|█▏        | 301/2416 [08:31<59:38,  1.69s/it]  

Batch: 300, Loss: 0.5608


Training:  17%|█▋        | 401/2416 [11:21<57:02,  1.70s/it]  

Batch: 400, Loss: 0.5865


Training:  21%|██        | 501/2416 [14:10<54:06,  1.70s/it]

Batch: 500, Loss: 0.5420


Training:  25%|██▍       | 601/2416 [17:01<51:37,  1.71s/it]

Batch: 600, Loss: 0.5181


Training:  28%|██▊       | 677/2416 [19:10<49:01,  1.69s/it]