# **Sentiment Analysis with Deep Learning using BERT**


## **What is BERT?**

BERT is a large-scale transformer-based Language Model that can be finetuned for a variety of tasks.

For more information, the original paper can be found here (https://arxiv.org/abs/1810.04805).

See also the HuggingFace BERT documentation (https://huggingface.co/transformers/model_doc/bert.html).

## 1: Exploratory Data Analysis and Preprocessing

In [None]:
import torch
from tqdm.notebook import tqdm
import numpy as np 
import pandas as pd

In [49]:
df = pd.read_csv("/content/TextVsLabel.csv")


In [50]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label_bias
0,0,"""Orange Is the New Black"" star Yael Stone is r...",Non-biased
1,1,"""We have one beautiful law,"" Trump recently sa...",Biased
2,2,"...immigrants as criminals and eugenics, all o...",Biased
3,3,...we sounded the alarm in the early months of...,Biased
4,4,[Black Lives Matter] is essentially a non-fals...,Biased


In [51]:
#df.columns = ["index","text","sentiment"]

In [52]:
# Drop the leftover index column that was read from the CSV. Reassigning
# (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [53]:
df.label_bias.value_counts()

Non-biased    1863
Biased        1810
Name: label_bias, dtype: int64

In [None]:
#df = df[df.sentiment.isin(['happy', 'not-relevant', 'angry', 'surprise', 'sad', 'disgust'])]

In [54]:
df.label_bias.value_counts()

Non-biased    1863
Biased        1810
Name: label_bias, dtype: int64

In [55]:
possible_labels = df.label_bias.unique()

In [56]:
# Map every distinct label to an integer id, numbered in the order the labels
# appear in possible_labels.
label_dict = {label: idx for idx, label in enumerate(possible_labels)}

In [57]:
label_dict

{'Non-biased': 0, 'Biased': 1}

In [58]:
# Replace the string labels with their integer ids from label_dict.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# (numeric) column through label_dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['label_bias']):
    df['label_bias'] = df['label_bias'].map(label_dict)
assert df['label_bias'].notna().all(), \
    "label_bias has unmapped/NaN entries; reload df and rerun the cells above"

In [59]:
df.head(10)

Unnamed: 0,text,label_bias
0,"""Orange Is the New Black"" star Yael Stone is r...",0
1,"""We have one beautiful law,"" Trump recently sa...",1
2,"...immigrants as criminals and eugenics, all o...",1
3,...we sounded the alarm in the early months of...,1
4,[Black Lives Matter] is essentially a non-fals...,1
5,[Democrats employ] their full arsenal to deleg...,1
6,[Newsoms's] obsession with masks has created a...,1
7,[Newsoms's] onslaught of propaganda ignores co...,1
8,[The police] now prefer to think of themselves...,1
9,‘A new low’: Washington Post media critic blow...,1


In [60]:
#df.drop("index", inplace = True , axis = 1)

In [None]:
df.head(2)

Unnamed: 0,text,sentiment
0,Dorian Gray with Rainbow Scarf #LoveWins (from...,0
1,@SelectShowcase @Tate_StIves ... Replace with ...,0


Classes are roughly balanced, as the counts above show (1863 Non-biased vs. 1810 Biased).

## 2: Training/Validation Split

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
# Split the row indices (not the rows themselves) into training and
# validation sets, holding out 15% for validation. The split is stratified on
# label_bias and uses a fixed random_state, so it is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label_bias.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.label_bias.values)

In [63]:
# Give every row a placeholder split name; the real 'train'/'val' values are
# written once the split indices are applied.
df['data_type'] = 'not_set'

In [64]:
df.head()


Unnamed: 0,text,label_bias,data_type
0,"""Orange Is the New Black"" star Yael Stone is r...",0,not_set
1,"""We have one beautiful law,"" Trump recently sa...",1,not_set
2,"...immigrants as criminals and eugenics, all o...",1,not_set
3,...we sounded the alarm in the early months of...,1,not_set
4,[Black Lives Matter] is essentially a non-fals...,1,not_set


In [65]:
df.shape

(3673, 3)

In [66]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [67]:
df.head(10)

Unnamed: 0,text,label_bias,data_type
0,"""Orange Is the New Black"" star Yael Stone is r...",0,train
1,"""We have one beautiful law,"" Trump recently sa...",1,train
2,"...immigrants as criminals and eugenics, all o...",1,val
3,...we sounded the alarm in the early months of...,1,train
4,[Black Lives Matter] is essentially a non-fals...,1,train
5,[Democrats employ] their full arsenal to deleg...,1,train
6,[Newsoms's] obsession with masks has created a...,1,train
7,[Newsoms's] onslaught of propaganda ignores co...,1,train
8,[The police] now prefer to think of themselves...,1,train
9,‘A new low’: Washington Post media critic blow...,1,train


In [68]:
df.groupby(['label_bias', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label_bias,data_type,Unnamed: 2_level_1
0,train,1584
0,val,279
1,train,1538
1,val,272


# 3. Loading Tokenizer and Encoding our Data

In [69]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [70]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [71]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', #bert-large-uncased
    do_lower_case=True
)

In [73]:
def encode_split(split_name, max_length=256):
    """Tokenize the texts of the rows of ``df`` assigned to one split.

    Args:
        split_name: value of ``df.data_type`` selecting the rows
            ('train' or 'val').
        max_length: length every sequence is padded or truncated to.

    Returns:
        The tokenizer output; 'input_ids' and 'attention_mask' are read from
        it below.
    """
    texts = df[df.data_type == split_name].text.values.tolist()
    return tokenizer(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # truncation is requested explicitly so longer texts are cut to
        # max_length and every row can be stacked into one tensor.
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )


encoded_data_train = encode_split('train')
encoded_data_val = encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label_bias.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label_bias.values)

In [74]:
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val, 
                            attention_masks_val,
                           labels_val)

In [None]:
len(dataset_train)

1258

In [75]:
dataset_val.tensors

(tensor([[ 101, 1012, 1012,  ...,    0,    0,    0],
         [ 101, 1520, 1996,  ...,    0,    0,    0],
         [ 101, 1037, 3438,  ...,    0,    0,    0],
         ...,
         [ 101, 2017, 2089,  ...,    0,    0,    0],
         [ 101, 2017, 2342,  ...,    0,    0,    0],
         [ 101, 2402, 2317,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
         0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
         1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
         0, 1, 1, 0, 0, 0, 1, 1, 1,

# 4. Setting up BERT Pretrained Model

In [76]:
from transformers import BertForSequenceClassification

In [77]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = len(label_dict),
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# 5. Creating Data Loaders

In [78]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [79]:
dataset_train

<torch.utils.data.dataset.TensorDataset at 0x7f88f0569fa0>

In [80]:
batch_size = 4        # training batch size
eval_batch_size = 32  # validation needs no gradients, so larger batches are used

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation does not need shuffling; a SequentialSampler keeps the evaluation
# order fixed from run to run.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=eval_batch_size
)

# 6. Setting Up Optimizer and Scheduler

In [81]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [82]:
# Use PyTorch's own AdamW rather than the deprecated transformers.AdamW.
# weight_decay is set to 0.0 explicitly because torch.optim.AdamW defaults to
# 0.01, whereas the transformers implementation it replaces defaulted to 0.0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8,
    weight_decay = 0.0
)



In [83]:
# Number of full passes over the training data.
epochs = 1

# Learning-rate schedule without warmup. The training loop steps the scheduler
# once per batch, so the total number of steps is batches-per-epoch * epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

# 7. Defining our Performance Metrics

In [84]:
import numpy as np
from sklearn.metrics import f1_score

In [85]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids.

    Returns:
        The value of sklearn's ``f1_score`` with ``average='weighted'``.
    """
    predicted_classes = np.asarray(preds).argmax(axis=1).ravel()
    true_classes = labels.ravel()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [86]:
def accuracy_per_class(preds, labels):
    """Print, for each true class, how many of its examples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids; ids are translated back to names
            through the module-level ``label_dict``.
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = int(in_class.sum())
        n_correct = int((predicted[in_class] == class_id).sum())
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

# 8. Creating our Training Loop

In [87]:
import random

# Seed every random number generator used by the notebook with the same value:
# Python's random module, NumPy, and PyTorch (CPU and all CUDA devices).
seed_val = 17
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [88]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # move the model onto the selected device
print(device)

cuda


In [89]:
def evaluate(dataloader_val):
    """Run the module-level ``model`` over a dataloader without gradient tracking.

    Args:
        dataloader_val: iterable of batches, each holding
            (input_ids, attention_mask, labels) tensors in that order.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the loss averaged
        over the number of batches, the logits of all batches concatenated
        along axis 0, and the matching labels concatenated along axis 0
        (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        moved = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = moved[0], moved[1], moved[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # With labels supplied, outputs[0] is the loss and outputs[1] the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    return (loss_val_avg,
            np.concatenate(logit_chunks, axis=0),
            np.concatenate(label_chunks, axis=0))

In [90]:
# Fine-tune the model for `epochs` passes over the training data. After each
# epoch the weights are checkpointed and the validation loss and weighted F1
# are reported.
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_value = loss.item()
        loss_train_total += loss_value
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. It was previously
        # divided by len(batch), which is the number of tensors in the batch
        # tuple (3), not the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss_value)})

    torch.save(model.state_dict(), f'BERT_ft_Epoch{epoch}.model')

    # The f-prefix was missing here, so the literal text "{epoch}" was printed.
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

# NOTE(review): this pickles the whole model object; the per-epoch state_dict
# checkpoints saved above are presumably the more portable artifact.
torch.save(model,'BertModel')

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/781 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.5426440585533422


  0%|          | 0/18 [00:00<?, ?it/s]

Validation loss: 0.5494070814715492
F1 Score (weighted): 0.7504648320308159


# 9. Evaluating our Model

In [91]:
accuracy_per_class(predictions, true_vals)

Class: Non-biased
Accuracy:238/279

Class: Biased
Accuracy:177/272

