# BERT Model Implementation

### Importing Necessary libraries

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree

### Setting the Device to GPU

In [4]:
# Report GPU availability, then pick the first CUDA device when there is one.
# Falling back to the CPU keeps every later `.to(device)` call working on
# machines without a GPU instead of failing at the first tensor transfer.
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

True


### Importing Datasets

In [8]:
import pandas as pd
# Both files are read as JSON Lines (one record per line) into DataFrames.
dev = pd.read_json('/kaggle/input/nlp-shared-task-dataset/subtaskA_dev_monolingual.jsonl', lines=True)
train = pd.read_json('/kaggle/input/nlp-shared-task-dataset/subtaskA_train_monolingual.jsonl', lines=True)

### Preprocessing the Datasets

In [9]:
    # Flatten the training columns into plain lists once.
    train_text_list = train['text'].tolist()
    train_source_list = train['source'].tolist()
    train_label_list = train['label'].tolist()

    # Partition training texts and sources by label (1 vs 0); rows carrying
    # any other label value are left out of both partitions.
    train_data_1s = [txt for lbl, txt in zip(train_label_list, train_text_list) if lbl == 1]
    train_source_1s = [src for lbl, src in zip(train_label_list, train_source_list) if lbl == 1]
    train_data_0s = [txt for lbl, txt in zip(train_label_list, train_text_list) if lbl == 0]
    train_source_0s = [src for lbl, src in zip(train_label_list, train_source_list) if lbl == 0]

    # Same partitioning for the dev split.
    dev_text_list = dev['text'].tolist()
    dev_label_list = dev['label'].tolist()
    dev_source_list = dev['source'].tolist()
    dev_data_1s = [txt for lbl, txt in zip(dev_label_list, dev_text_list) if lbl == 1]
    dev_source_1s = [src for lbl, src in zip(dev_label_list, dev_source_list) if lbl == 1]
    dev_data_0s = [txt for lbl, txt in zip(dev_label_list, dev_text_list) if lbl == 0]
    dev_source_0s = [src for lbl, src in zip(dev_label_list, dev_source_list) if lbl == 0]

    # Sanity check: text and source counts must agree within each class.
    print(len(train_data_1s),len(train_data_0s),len(train_source_1s),len(train_source_0s))

    # Per-class training frames that the sampling step draws from.
    train_1s = pd.DataFrame({'text': train_data_1s, 'source': train_source_1s})
    train_0s = pd.DataFrame({'text': train_data_0s, 'source': train_source_0s})

56406 63351 56406 63351


In [10]:
import random
def retrieveData(records, random_state=None):
    """Draw a class-balanced random sample from the per-class training frames.

    Reads the notebook-level frames ``train_1s`` and ``train_0s`` and samples
    ``records`` rows (without replacement) from each of them.

    Args:
        records: Number of rows to draw from each class.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A tuple ``(texts, labels, sources)`` of three lists with
        ``2 * records`` entries each: the label-1 rows first, then the
        label-0 rows.

    Raises:
        ValueError: Raised by pandas when ``records`` exceeds the number of
            rows available in either class frame.
    """
    train_sample_1s = train_1s.sample(n=records, random_state=random_state)
    train_sample_0s = train_0s.sample(n=records, random_state=random_state)
    train_text = train_sample_1s['text'].tolist() + train_sample_0s['text'].tolist()
    # Labels follow the concatenation order above: all 1s, then all 0s.
    train_label = [1] * records + [0] * records
    train_source = train_sample_1s['source'].tolist() + train_sample_0s['source'].tolist()
    print(len(train_text),len(train_label),len(train_source))
    return train_text,train_label,train_source
# Draw 20000 rows per class for training.
train_text, train_label, train_source = retrieveData(records=20000)
# Disabled alternative: a fixed 2000-per-class subset of the dev split.
#dev_text=dev_data_1s[:2000]+dev_data_0s[:2000]
#dev_label=[1 for i in range(2000)]+ [0 for i in range(2000)]
#dev_source=dev_source_1s[:2000]+dev_source_0s[:2000]

40000 40000 40000


In [11]:
# Put text, source and label in one frame so a single shuffle keeps them aligned.
train_df = pd.DataFrame({'text': train_text, 'source': train_source, 'label': train_label})
#dev_df={'text':dev_text,'source':dev_source}
#dev_df = pd.DataFrame(dev_df)
train_df = train_df.sample(frac=1)
# Read the labels back in shuffled order, then keep only the feature columns.
train_label = train_df['label'].tolist()
train_df = train_df[['text', 'source']].copy()
print(len(train_df), len(train_label))

40000 40000


## Splitting the Dataset

In [5]:
# Train data: 80%, validation data: 20% (split seeded with random_state=42).
train_data, val_data, train_labels, val_labels = train_test_split(
    train_df,
    train_label,
    test_size=0.2,
    random_state=42,
)

### Creating Custom Dataset Class with Feature Engineering

In [13]:
class TextAndCategoricalDataset(Dataset):
    """Dataset that tokenizes a text together with its source string on access.

    Each item is a dict with the tokenizer output under ``'encode_text'`` and
    a one-element label tensor under ``'labels'``.

    The NER and POS helpers are available as methods, but ``__getitem__``
    does not call them, and ``tfidf_vectorizer`` is only stored: the encoded
    input is built from the text and the source alone.
    """

    def __init__(self, texts, sources, labels, tokenizer, tfidf_vectorizer, max_length=512):
        """Store the parallel sequences and the tokenization settings.

        Args:
            texts: Indexable collection of texts.
            sources: Indexable collection of source values, parallel to ``texts``.
            labels: Indexable collection of labels, parallel to ``texts``.
            tokenizer: Callable applied to the combined string.
            tfidf_vectorizer: Kept on the instance; unused by the methods here.
            max_length: Passed to the tokenizer as ``max_length``.
        """
        self.texts, self.sources, self.labels = texts, sources, labels
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer
        self.max_length = max_length

    def pos_tagging(self, text):
        """Return the POS tag of every token in ``text`` (tags only)."""
        tagged_tokens = pos_tag(word_tokenize(text))
        return [tag for (_token, tag) in tagged_tokens]

    def named_entity_recognition(self, text):
        """Return the named-entity chunks of ``text`` as space-joined strings."""
        chunked = ne_chunk(pos_tag(word_tokenize(text)))
        # Only subtree nodes are entity chunks; plain (token, tag) leaves are skipped.
        return [
            ' '.join([token for token, _tag in node.leaves()])
            for node in chunked
            if isinstance(node, Tree)
        ]

    def __len__(self):
        """Number of items, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the encoded model input and label tensor for item ``idx``."""
        sample_text = str(self.texts[idx])
        sample_source = str(self.sources[idx])
        target = int(self.labels[idx])

        # Text and source are merged into one string before tokenization.
        combined_text = f"text:{sample_text} [SEP] source:{sample_source}"

        tokenizer_options = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(combined_text, **tokenizer_options)

        return {'encode_text': encoded, 'labels': torch.tensor([target])}

### Creating BERT Model class

In [14]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        bert_model_name: Name or path given to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced per input.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, inputs):
        """Return raw class logits for an encoded batch.

        Args:
            inputs: Encoded batch that offers ``.to(device)`` and can be
                unpacked with ``**`` into keyword arguments for the encoder
                (for example, the tokenizer output).

        Returns:
            The output of the final linear layer applied to the encoder's
            pooled output. No softmax or argmax is applied.
        """
        # Use the device the weights live on instead of a notebook-level
        # global, so the module keeps working wherever it has been moved.
        module_device = next(self.parameters()).device
        outputs = self.bert(**inputs.to(module_device))
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [15]:
#Training Loop
def train(model, data_loader, optimizer, device,epoch):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    NOTE: this function shares its name with the ``train`` DataFrame loaded
    near the top of the notebook. Once this cell runs, ``train`` refers to
    the function, so cells that read the DataFrame must run before it.

    Args:
        model: Module called as ``model(inputs)``, returning class logits.
        data_loader: Iterable of dicts with an ``'encode_text'`` entry
            (offering ``.to(device)``) and a ``'labels'`` tensor.
        optimizer: Optimizer over the model parameters.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the printed summary.

    Prints the mean cross-entropy loss over the pass (``nan`` when
    ``data_loader`` yields no batches). Returns ``None``.
    """
    model.train()
    # The loss module is stateless, so build it once instead of once per step.
    criterion = nn.CrossEntropyLoss()
    losses = []
    for batch in data_loader:
        optimizer.zero_grad()
        inputs = batch['encode_text'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    # Guard the average so an empty loader does not raise ZeroDivisionError.
    mean_loss = sum(losses) / len(losses) if losses else float("nan")
    print(f"epoch {epoch}, loss: {mean_loss}")

### Initializing Hyper Parameters

In [16]:
# Pretrained checkpoint used for both the tokenizer and the encoder.
bert_model_name = 'bert-base-uncased'
# Size of the classifier's output layer.
num_classes = 2
# Token limit. The dataset constructors further down are not given this
# value; they use their own default, which is also 512.
max_length = 512
# Batch size handed to the DataLoader constructors.
batch_size = 16
# Number of passes over the training data.
num_epochs = 7
# Optimizer step size.
learning_rate = 9e-6

In [17]:
# Defining evaluation loop
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with gradient tracking disabled.

    Args:
        model: Module called as ``model(inputs)``, returning class logits.
        data_loader: Iterable of dicts with an ``'encode_text'`` entry
            (offering ``.to(device)``) and a ``'labels'`` tensor.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(accuracy, report)`` holding the results of
        ``accuracy_score`` and ``classification_report`` on the collected
        labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            encoded = batch['encode_text'].to(device)
            targets = batch['labels'].to(device)
            logits = model(encoded)
            # The index of the largest logit in each row is the predicted class.
            best_class = torch.max(logits, dim=1).indices
            predicted += best_class.to(device).tolist()
            expected += targets.to(device).tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [18]:
def predict_sentiment(text, model, tokenizer, device, max_length=512):
    """Classify a single text and map the predicted class to a string.

    The model is called with the whole encoded batch as one positional
    argument, the same convention the training and evaluation loops use.
    The previous keyword call (``input_ids=..., attention_mask=...``) does
    not match a ``forward(self, inputs)`` signature and raised ``TypeError``.

    NOTE(review): training items are formatted as
    ``"text:<text> [SEP] source:<source>"`` by the dataset class, whereas
    this helper tokenizes the raw ``text`` only. Confirm that is intended.

    Args:
        text: Input string.
        model: Module called as ``model(encoding)``, returning class logits.
        tokenizer: Callable producing an encoding that offers ``.to(device)``.
        device: Device the encoding is moved to.
        max_length: Length the tokenizer pads or truncates to.

    Returns:
        ``"positive"`` when the predicted class index is 1, otherwise
        ``"negative"``.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    encoding = encoding.to(device)

    with torch.no_grad():
        outputs = model(encoding)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"

### Initializing Model, Optimizer, DataLoaders

In [19]:
# Initializing the TF-IDF vectorizer for feature engineering (max_features=1000).
# NOTE(review): the vectorizer is passed to the dataset objects, but the dataset
# class visible in this notebook only stores it and never calls it.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

In [20]:
# Initializing the tokenizer, the datasets and their data loaders
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextAndCategoricalDataset(
    texts=train_data['text'].tolist(),
    sources=train_data['source'].tolist(),
    labels=train_labels,
    tokenizer=tokenizer,
    tfidf_vectorizer=tfidf_vectorizer,
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TextAndCategoricalDataset(
    texts=val_data['text'].tolist(),
    sources=val_data['source'].tolist(),
    labels=val_labels,
    tokenizer=tokenizer,
    tfidf_vectorizer=tfidf_vectorizer,
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
# Initializing the BERT model, then moving it onto the selected device
model = BERTClassifier(bert_model_name=bert_model_name, num_classes=num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [22]:
# Initializing the Adamax optimizer over all model parameters
from torch.optim import Adamax
optimizer = Adamax(model.parameters(), lr=learning_rate)

### Training the Model

In [23]:
# Starting Training
# NOTE(review): train() and evaluate() receive the Dataset objects here, not
# train_dataloader / val_dataloader. Each loop step therefore handles a single
# item, and the loaders' batch_size and shuffle settings are never applied.
# Confirm whether this is intended before relying on batch_size.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataset, optimizer, device, epoch + 1)
    accuracy, report = evaluate(model, val_dataset, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/7
epoch 1, loss: 0.1787932621262662
Validation Accuracy: 0.9091
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      4023
           1       0.85      0.99      0.92      3977

    accuracy                           0.91      8000
   macro avg       0.92      0.91      0.91      8000
weighted avg       0.92      0.91      0.91      8000

Epoch 2/7
epoch 2, loss: 0.08225797993359629
Validation Accuracy: 0.9459
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      4023
           1       0.91      0.99      0.95      3977

    accuracy                           0.95      8000
   macro avg       0.95      0.95      0.95      8000
weighted avg       0.95      0.95      0.95      8000

Epoch 3/7
epoch 3, loss: 0.054229181339622304
Validation Accuracy: 0.9107
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      4023
           1      

### Testing the model

In [24]:
# Testing on Dev dataset
dev_text, dev_label, dev_source = (dev[column].tolist() for column in ('text', 'label', 'source'))
dev_dataset = TextAndCategoricalDataset(
    texts=dev_text,
    sources=dev_source,
    labels=dev_label,
    tokenizer=tokenizer,
    tfidf_vectorizer=tfidf_vectorizer,
)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)
# As in the training cell, evaluate() is given the dataset itself;
# dev_dataloader is built above but not consumed here.
accuracy, report = evaluate(model, dev_dataset, device)
print(f"Dev Accuracy: {accuracy:.4f}")
print(report)

Dev Accuracy: 0.8268
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      2500
           1       0.82      0.83      0.83      2500

    accuracy                           0.83      5000
   macro avg       0.83      0.83      0.83      5000
weighted avg       0.83      0.83      0.83      5000

