In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertModel, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

2025-04-19 11:33:46.129466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745062426.324321      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745062426.390306      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Load train dataset
train_df = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding='ISO-8859-1')
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [4]:
# Load test dataset
test_df = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/test.csv", encoding='ISO-8859-1')
test_df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [5]:
# Preprocess labels
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
train_df = train_df[train_df['sentiment'].isin(label_map.keys())]  # Filter valid sentiments
train_df['label'] = train_df['sentiment'].map(label_map)

In [6]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),label
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60,1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105,0
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26,0


In [7]:
# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].tolist(), train_df['label'].tolist(), test_size=0.2, random_state=42)

In [8]:
# Ensure text data is string and drop missing values
filtered = [(str(text), label) for text, label in zip(train_texts, train_labels) if pd.notnull(text)]
train_texts, train_labels = zip(*filtered)
train_texts = list(train_texts)
train_labels = list(train_labels)


In [9]:
# val_texts = [str(text) for text in val_texts if pd.notnull(text)]
filtered = [(str(text), label) for text, label in zip(val_texts, val_labels) if pd.notnull(text)]
val_texts, val_labels = zip(*filtered)
val_texts = list(val_texts)
val_labels = list(val_labels)


In [10]:
len(train_texts)

21983

In [11]:
len(train_labels)

21983

In [14]:
# Preprocess labels
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
test_df = test_df[test_df['sentiment'].isin(label_map.keys())]  # Filter valid sentiments
test_df['label'] = test_df['sentiment'].map(label_map)

In [15]:
test_df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),label
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,1
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,2
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,2
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,2


In [17]:
test_df = test_df[pd.notnull(test_df['text'])]

In [18]:
# Extract text and sentiment 
test_texts = list(test_df['text'])
test_labels = list(test_df['label'])

In [19]:
len(test_texts)

3534

In [20]:
len(test_labels)

3534

In [21]:
# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [22]:
# Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [23]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # keep as list

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [24]:
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

## Training data on pretrained Bert-based model

In [25]:
# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [26]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

In [27]:
def train(model, train_dataset, val_dataset, epochs=3, batch_size=16):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        loop = tqdm(train_loader, leave=True)

        for batch in loop:
            # ✅ Now this works, because each batch is a dict with tensors
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            loop.set_description(f"Epoch {epoch + 1}")
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch + 1} complete. Avg Loss = {avg_loss:.4f}")

        evaluate(model, val_loader)
    return model

In [28]:
# Evaluation
def evaluate(model, val_loader):
    model.eval()
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, axis=1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    print("\nClassification Report:\n")
    print(classification_report(true_labels, preds, target_names=label_map.keys()))

In [29]:
# Run training
model = train(model, train_dataset, val_dataset)

Epoch 1: 100%|██████████| 1374/1374 [04:10<00:00,  5.48it/s, loss=0.464]



Epoch 1 complete. Avg Loss = 0.6009

Classification Report:

              precision    recall  f1-score   support

    negative       0.86      0.65      0.74      1562
     neutral       0.69      0.83      0.75      2230
    positive       0.83      0.79      0.81      1705

    accuracy                           0.77      5497
   macro avg       0.79      0.76      0.77      5497
weighted avg       0.78      0.77      0.77      5497



Epoch 2: 100%|██████████| 1374/1374 [04:10<00:00,  5.49it/s, loss=0.534] 



Epoch 2 complete. Avg Loss = 0.4266

Classification Report:

              precision    recall  f1-score   support

    negative       0.80      0.80      0.80      1562
     neutral       0.78      0.75      0.76      2230
    positive       0.81      0.85      0.83      1705

    accuracy                           0.80      5497
   macro avg       0.80      0.80      0.80      5497
weighted avg       0.79      0.80      0.79      5497



Epoch 3: 100%|██████████| 1374/1374 [04:10<00:00,  5.49it/s, loss=0.132] 



Epoch 3 complete. Avg Loss = 0.2934

Classification Report:

              precision    recall  f1-score   support

    negative       0.80      0.76      0.78      1562
     neutral       0.72      0.79      0.75      2230
    positive       0.84      0.78      0.81      1705

    accuracy                           0.78      5497
   macro avg       0.79      0.78      0.78      5497
weighted avg       0.78      0.78      0.78      5497



In [30]:
# Results on validation data

val_loader = DataLoader(val_dataset, batch_size=16)
evaluate(model, val_loader)


Classification Report:

              precision    recall  f1-score   support

    negative       0.80      0.76      0.78      1562
     neutral       0.72      0.79      0.75      2230
    positive       0.84      0.78      0.81      1705

    accuracy                           0.78      5497
   macro avg       0.79      0.78      0.78      5497
weighted avg       0.78      0.78      0.78      5497



In [31]:
# Results on test data

test_loader = DataLoader(test_dataset, batch_size=16)
evaluate(model, test_loader)


Classification Report:

              precision    recall  f1-score   support

    negative       0.78      0.77      0.78      1001
     neutral       0.72      0.78      0.75      1430
    positive       0.86      0.78      0.82      1103

    accuracy                           0.78      3534
   macro avg       0.79      0.78      0.78      3534
weighted avg       0.78      0.78      0.78      3534



In [40]:
torch.save(model.state_dict(), "/kaggle/working/bert_model.pth")

## Fine-tuning the pretrained Bert model and training it on dataset

In [32]:
# Fine-tuned model after adding extra layers to the original classification model

class CustomBERTClassifier(nn.Module):
    def __init__(self, dropout=0.3, hidden_size=768, num_labels=3):
        super(CustomBERTClassifier, self).__init__()

        # Start with the original classification model
        self.bert_fc = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

        # Add extra custom layers after BERT's classification output
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(num_labels, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.bert_fc(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )

        # Get logits from BERT
        logits = outputs.logits  # shape: (batch_size, num_labels)

        # Pass through custom classifier head
        logits = self.classifier(logits)

        # Compute loss if labels provided
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {'loss': loss, 'logits': logits}


In [33]:
model_tuned = CustomBERTClassifier()
model_tuned.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomBERTClassifier(
  (bert_fc): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_f

In [34]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

In [35]:
def train(model, train_dataset, val_dataset, epochs=3, batch_size=16):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        loop = tqdm(train_loader, leave=True)

        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs['loss']  # ✅ Access loss from dict

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            loop.set_description(f"Epoch {epoch + 1}")
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch + 1} complete. Avg Loss = {avg_loss:.4f}")

        evaluate(model, val_loader)

    return model


In [36]:
def evaluate(model, dataloader):
    model.eval()
    preds = []
    labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            logits = outputs['logits']  # ✅ correct access
            preds.extend(torch.argmax(logits, axis=1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())

    print("\nValidation Set Classification Report:\n")
    print(classification_report(labels, preds, target_names=['negative', 'neutral', 'positive']))

In [37]:
model_tuned = train(model_tuned, train_dataset, val_dataset)

Epoch 1: 100%|██████████| 1374/1374 [03:50<00:00,  5.95it/s, loss=1.16] 



Epoch 1 complete. Avg Loss = 1.1516


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation Set Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      1562
     neutral       0.00      0.00      0.00      2230
    positive       0.31      1.00      0.47      1705

    accuracy                           0.31      5497
   macro avg       0.10      0.33      0.16      5497
weighted avg       0.10      0.31      0.15      5497



Epoch 2: 100%|██████████| 1374/1374 [03:50<00:00,  5.95it/s, loss=1.2]  



Epoch 2 complete. Avg Loss = 1.1532


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation Set Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      1562
     neutral       0.00      0.00      0.00      2230
    positive       0.31      1.00      0.47      1705

    accuracy                           0.31      5497
   macro avg       0.10      0.33      0.16      5497
weighted avg       0.10      0.31      0.15      5497



Epoch 3: 100%|██████████| 1374/1374 [03:51<00:00,  5.95it/s, loss=1.01] 



Epoch 3 complete. Avg Loss = 1.1526

Validation Set Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      1562
     neutral       0.00      0.00      0.00      2230
    positive       0.31      1.00      0.47      1705

    accuracy                           0.31      5497
   macro avg       0.10      0.33      0.16      5497
weighted avg       0.10      0.31      0.15      5497



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# Results on validation data

val_loader = DataLoader(val_dataset, batch_size=16)
evaluate(model_tuned, val_loader)


Validation Set Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      1562
     neutral       0.00      0.00      0.00      2230
    positive       0.31      1.00      0.47      1705

    accuracy                           0.31      5497
   macro avg       0.10      0.33      0.16      5497
weighted avg       0.10      0.31      0.15      5497



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Results on validation data

test_loader = DataLoader(test_dataset, batch_size=16)
evaluate(model_tuned, test_loader)


Validation Set Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      1001
     neutral       0.00      0.00      0.00      1430
    positive       0.31      1.00      0.48      1103

    accuracy                           0.31      3534
   macro avg       0.10      0.33      0.16      3534
weighted avg       0.10      0.31      0.15      3534



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
torch.save(model_tuned.state_dict(), "/kaggle/working/finetuned_bert_model.pth")

### We can see from the results that the original pretrained Bert model is giving better results when trained on sentiment dataset, rather than the finetuned Bert model.