## Fine Tuning Bert Model

In [28]:
from transformers import BertTokenizer, BertModel,get_linear_schedule_with_warmup

In [29]:
import pandas as pd
import glob
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim

import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from collections import defaultdict
from textwrap import wrap

from transformers import DataCollatorWithPadding

In [5]:
file_path = '../Data/Prepared/NormalAnalysis/'

In [6]:
files = glob.glob(f'{file_path}*.csv')

In [7]:
files

['../Data/Prepared/NormalAnalysis/fin_phrase_TF_IDF.csv',
 '../Data/Prepared/NormalAnalysis/fin_phrase_bank_clean.csv',
 '../Data/Prepared/NormalAnalysis/Tweet_train_clean.csv',
 '../Data/Prepared/NormalAnalysis/Tweet_valid_clean.csv',
 '../Data/Prepared/NormalAnalysis/fin_report_df.csv']

In [53]:
df_train = pd.read_csv('../Data/Prepared/NormalAnalysis/Tweet_train_clean.csv')

In [51]:
df_valid = pd.read_csv('../Data/Prepared/NormalAnalysis/Tweet_valid_clean.csv')

In [65]:
df_val, df_test = train_test_split(df_valid, test_size = 0.5, random_state = RANDOM_SEED)

In [54]:
df_train.head()

Unnamed: 0,text,label,clean_text,topic
0,Here are Thursday's biggest analyst calls: App...,0,thursday biggest analyst call apple amazon tes...,Analyst Update
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy la vega sand travel singapore build well f...,Analyst Update
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrade docusign sell citing e...,Analyst Update
3,"Analysts react to Tesla's latest earnings, bre...",0,analyst react tesla latest earnings break next...,Analyst Update
4,Netflix and its peers are set for a ‘return to...,0,netflix peer set return growth analyst say giv...,Analyst Update


In [73]:
class_names = sorted(df_train.label.unique())
class_names

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [56]:
df_train.label.value_counts()

label
2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: count, dtype: int64

In [57]:
sorted(df_train.topic.unique())

['Analyst Update',
 'Company | Product News',
 'Currencies',
 'Dividend',
 'Earnings',
 'Energy | Oil',
 'Fed | Central Banks',
 'Financials',
 'General News | Opinion',
 'Gold | Metals | Materials',
 'IPO',
 'Legal | Regulation',
 'M&A | Investments',
 'Macro',
 'Markets',
 'Personnel Change',
 'Politics',
 'Stock Commentary',
 'Stock Movement',
 'Treasuries | Corporate Debt']

In [60]:
class TextDataset(Dataset):
    def __init__(self, text, targets, tokenizer, max_len, include_raw_text=False):
        self.text = text
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.include_raw_text = include_raw_text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = self.max_len,
            return_token_type_ids = False,
            return_attention_mask = True,
            truncation = True,
            padding = True,
            return_tensors = 'pt',)

        output = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
        if self.include_raw_text:
            output['text_text'] = text

        return output


In [63]:
class SentimentClassifierModel(nn.Module):
    """
    Sentiment classification model based on BERT.

    Args:
        n_classes (int): Number of classes for sentiment classification.

    """
    def __init__(self, n_classes):
        super(SentimentClassifierModel, self).__init__()
        self.bert = BertModel.from_pretrained(pre_trained_model_ckpt,return_dict=False)
        self.drop = nn.Dropout(p = 0.2)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the sentiment classifier model.

        Args:
            input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
            attention_mask (torch.Tensor): Attention mask tensor of shape (batch_size, sequence_length).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, n_classes).

        """
        _, pooled_output = self.bert(
            input_ids = input_ids,
            attention_mask= attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [64]:
def train_bert_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """
    Trains the given model using the provided data loader and optimizer.

    Args:
        model (torch.nn.Module): Model to train.
        data_loader (DataLoader): DataLoader providing the training data.
        loss_fn: Loss function to optimize.
        optimizer: Optimizer for updating the model's parameters.
        device: Device to use for training.
        scheduler: Learning rate scheduler.
        n_examples (int): Total number of training examples.

    Returns:
        float: Accuracy of the model on the training data.
        float: Average training loss.

    """
    model=model.train()
    losses = []
    correct_predictions = 0

    """  iterate over the batches provided by the data loader. Within each iteration, the batch tensors are moved to the appropriate device. The model performs a forward pass on the input tensors, and the predicted labels are obtained by taking the maximum value along the appropriate dimension. """
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim = 1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets).cpu()

        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions/n_examples, np.mean(losses)

In [107]:
def eval_bert_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            _,preds = torch.max(outputs, dim = 1)

            loss = loss_fn(outputs, targets.detach())
            correct_predictions += torch.sum(preds == targets).cpu()
            losses.append(loss.item())
    return correct_predictions/n_examples, np.mean(losses)

In [None]:
def get_bert_predictions(model, data_loader):
    model = model.eval()
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["clean_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim =1)
            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

In [34]:
pre_trained_model_ckpt = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pre_trained_model_ckpt)

vocab.txt: 100%|███████████████████████████████████████| 232k/232k [00:00<00:00, 1.28MB/s]
tokenizer_config.json: 100%|████████████████████████████| 48.0/48.0 [00:00<00:00, 122kB/s]
config.json: 100%|███████████████████████████████████████| 570/570 [00:00<00:00, 2.08MB/s]


In [36]:
RANDOM_SEED = 42

In [37]:
np.random.seed(RANDOM_SEED)

In [38]:
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x10ef95970>

In [39]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [91]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = 'longest' )

In [92]:
MAX_LEN = 512
BATCH_SIZE = 16

In [93]:
def create_data_loader(df, tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE, include_raw_text = False ):
    ds = TextDataset(
        text=df.text.to_list(),
        targets = df.label.to_list(),
        tokenizer=tokenizer,
        max_len=max_len,
        include_raw_text=include_raw_text
    )
    return DataLoader(ds, batch_size=batch_size, collate_fn=collator )

In [94]:
train_data_loader = create_data_loader(df_train, tokenizer)

In [95]:
val_data_loader = create_data_loader(df_val, tokenizer)

In [96]:
test_data_loader = create_data_loader(df_test, tokenizer)

In [99]:
model = SentimentClassifierModel(len(class_names))
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [101]:
#Testing to see if the data loader works appropriately
data = next(iter(train_data_loader))

print(data)
print(data.keys()) # dict_keys(['input_ids', 'attention_mask', 'targets'])

print(data['input_ids'].shape) # torch.Size([16, 512])

print(data['attention_mask'].shape) # torch.Size([16, 512])

print(data['targets'].shape) # torch.Size([16])

{'input_ids': tensor([[  101,  2182,  2024,  9432,  1005,  1055,  5221, 12941,  4455,  1024,
          6207,  1010,  9733,  1010, 26060,  1010, 14412,  4630,  4313,  1010,
          9986,  2271, 23773,  1010,  4654, 22500,  1004, 23713,  1025,  2062,
         16770,  1024,  1013,  1013,  1056,  1012,  2522,  1013,  1053,  2361,
          2078,  2620,  2290, 13668,  2581, 27225,   102],
        [  101,  4965,  5869,  7136, 13457,  2004,  3604,  2000,  5264, 16473,
          1010,  7051, 23054,  2758, 16770,  1024,  1013,  1013,  1056,  1012,
          2522,  1013, 13109,  2015,  2475,  2860, 28311, 18682,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101, 11939,  5472,  3917,  2091, 24170,  2015,  9986,  2271, 23773,
          2000,  5271,  1010,  8951,  8319, 10831, 13463,  5766,  6653, 16770,
          1024,  1013,  1013,  1056,  1012,  2522,  1013,  1015,  6633,  3723,

In [102]:
# Just An evaluation run of the model
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
F.softmax(model(input_ids,attention_mask), dim = 1)

tensor([[0.0626, 0.0713, 0.0193, 0.0357, 0.0440, 0.0852, 0.0934, 0.0358, 0.0731,
         0.0717, 0.0382, 0.0383, 0.0344, 0.0798, 0.0509, 0.0348, 0.0349, 0.0146,
         0.0402, 0.0418],
        [0.0587, 0.0753, 0.0171, 0.0353, 0.0401, 0.0516, 0.0963, 0.0282, 0.0578,
         0.0536, 0.0790, 0.0528, 0.0379, 0.0915, 0.0812, 0.0233, 0.0287, 0.0135,
         0.0304, 0.0475],
        [0.0435, 0.0732, 0.0193, 0.0404, 0.0721, 0.0737, 0.0729, 0.0320, 0.0566,
         0.0648, 0.0536, 0.0423, 0.0295, 0.0817, 0.0736, 0.0370, 0.0478, 0.0144,
         0.0400, 0.0317],
        [0.0682, 0.0451, 0.0163, 0.0305, 0.0631, 0.1142, 0.0844, 0.0364, 0.0565,
         0.0635, 0.0515, 0.0416, 0.0324, 0.0840, 0.0515, 0.0333, 0.0302, 0.0157,
         0.0375, 0.0443],
        [0.0542, 0.0610, 0.0302, 0.0420, 0.0494, 0.0644, 0.0782, 0.0342, 0.0490,
         0.0644, 0.0520, 0.0484, 0.0353, 0.0771, 0.0597, 0.0300, 0.0527, 0.0213,
         0.0500, 0.0466],
        [0.0703, 0.0648, 0.0215, 0.0494, 0.0443, 0.0441, 0.0

In [105]:
EPOCHS = 2

optimizer = optim.AdamW(model.parameters(), lr= 1e-5)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0, 
    num_training_steps=total_steps)

# For multi-class classification you would usually just use nn.CrossEntropyLoss 
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/ {EPOCHS}')
    print('-'*15)
    train_acc, train_loss = train_bert_model(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_bert_model(model, val_data_loader, loss_fn, device, len(df_val))
    print(f'Val loss {val_loss} accuracy {val_acc}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
  
    if val_acc>best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc


Epoch 1/ 2
---------------
