In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import numpy as np
import pandas as pd
import torch
import string
import re
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
# The bare name on the last line echoes the chosen device as the cell output.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# Read both CSV splits. `header=None` means the files carry no header row,
# so pandas labels the columns with integers.
training_df, validation_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',
        '/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv',
    )
)
print(training_df.shape, validation_df.shape)

# Drop every row containing a null in either split; only the training shape
# is reported before and after.
print(f'Shape before dropping nulls {training_df.shape}')
training_df, validation_df = training_df.dropna(), validation_df.dropna()
print(f'Shape after dropping nulls {training_df.shape}')



(74682, 4) (1000, 4)
Shape before dropping nulls (74682, 4)
Shape after dropping nulls (73996, 4)


In [4]:
import re

def clean_text(tweet):
    """Normalise a raw tweet into lowercase text made of letters and whitespace.

    The removals run in a fixed order: URLs, then @mentions / #hashtags, then
    every character that is neither an ASCII letter nor whitespace, then the
    standalone retweet marker ``RT``. Lowercasing happens last, so the ``RT``
    pattern is matched against the original capitalisation.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        The cleaned, lowercased string. Whitespace left behind by removed
        tokens is neither collapsed nor stripped.
    """
    # Order matters: e.g. punctuation is stripped before the RT check runs.
    removal_patterns = (
        r'http\S+',                        # URLs
        r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+',  # mentions and hashtags
        r'[^A-Za-z\s]',                    # special characters, numbers, punctuation
        r'\bRT\b',                         # 'RT' (retweet) indicator
    )

    cleaned = tweet
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.lower()

In [5]:
# Overwrite column 3 (the tweet text) of both splits with its cleaned form.
for frame in (training_df, validation_df):
    frame.loc[:, 3] = frame[3].apply(clean_text)

In [6]:
# Fit the encoder on column 2 of the training split only, then reuse the same
# mapping for both splits so the integer ids agree between them.
le = LabelEncoder().fit(training_df[2])

training_df['Labels'] = le.transform(training_df[2])
validation_df['Labels'] = le.transform(validation_df[2])

# Plain Python lists of the encoded targets.
training_output_y = training_df['Labels'].tolist()
validation_output_y = validation_df['Labels'].tolist()

In [7]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classification head gets one output per class seen by the label encoder.
checkpoint = 'bert-base-uncased'
num_classes = len(le.classes_)

tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=num_classes)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Both splits are tokenized with the same settings: truncate at 512 tokens,
# pad, and return PyTorch tensors.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors='pt')

train_encodings = tokenizer(training_df[3].tolist(), **tokenize_kwargs)
test_encodings = tokenizer(validation_df[3].tolist(), **tokenize_kwargs)

In [9]:
# Echo the training frame as the cell output to inspect the cleaned text and the `Labels` column.
training_df

Unnamed: 0,0,1,2,3,Labels
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,3
1,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,3
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,3
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,3
4,2401,Borderlands,Positive,im getting on borderlands and i will murder y...,3
...,...,...,...,...,...
74677,9200,Nvidia,Positive,just realized that the windows partition of my...,3
74678,9200,Nvidia,Positive,just realized that my mac window partition is ...,3
74679,9200,Nvidia,Positive,just realized the windows partition of my mac ...,3
74680,9200,Nvidia,Positive,just realized between the windows partition of...,3


In [10]:
# Encoded targets as tensors, one per split.
train_labels = torch.tensor(training_df['Labels'].tolist())
test_labels = torch.tensor(validation_df['Labels'].tolist())

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)

# Create DataLoader: shuffle only the training split; evaluation keeps file order.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [11]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of `torch.optim.AdamW`. `eps` and `weight_decay` are pinned to the
# values the transformers implementation defaulted to (1e-6 and 0.0), because
# torch.optim.AdamW would otherwise default to 1e-8 and 0.01.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# NOTE(review): the training loop takes its loss from `outputs.loss`, so this
# criterion looks unused in the visible cells — kept in case other cells use it.
loss_fn = torch.nn.CrossEntropyLoss()



In [13]:
model.to(device)
epochs = 3

# Training: one full pass over `train_dataloader` per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    progress = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}')
    for batch in progress:
        # A batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average Training Loss: {avg_train_loss}')

# Evaluation
model.eval()
predictions, true_labels = [], []

for batch in tqdm(test_dataloader, desc='Evaluating'):
    input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

    # Forward pass only; no gradients are tracked.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    # Predicted class = index of the largest logit in each row.
    predicted_labels = torch.max(logits, dim=1).indices

    predictions.extend(predicted_labels.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


Epoch 1/3: 100%|██████████| 9250/9250 [37:50<00:00,  4.07it/s]


Average Training Loss: 0.8235388864713746


Epoch 2/3: 100%|██████████| 9250/9250 [37:51<00:00,  4.07it/s]


Average Training Loss: 0.34332437679322586


Epoch 3/3: 100%|██████████| 9250/9250 [37:54<00:00,  4.07it/s]


Average Training Loss: 0.17270297830032713


Evaluating: 100%|██████████| 125/125 [00:04<00:00, 27.92it/s]


NameError: name 'accuracy_score' is not defined

In [15]:
# Score the predictions collected during evaluation against the true labels
# and report the result as a percentage.
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 97.50%
