## Sentiment Classification with Huggingface+BERT fine-tuning

- We'll be using the IMDB dataset, which you need to [download](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) and extract, set the `data_base_folder` to the extracted folder

In [11]:
import os
import re
import numpy as np 
import pandas as pd
import shutil
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
from tqdm import tqdm_notebook as tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cuda'

In [12]:
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text

def load_data(path):
    onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    print('found {} files'.format(len(onlyfiles)))
    all_text = []
    for f in onlyfiles:
        with open('{}/{}'.format(path, f)) as handle:
            lines = clean_text(handle.readlines()[0])
            all_text.append(lines)
        
    return all_text

def generate_truthfulness_2way(row):
    if row['target'] == True:
        if row['3_label_majority_answer'] == 'Agree':
            return "True"
        elif row['3_label_majority_answer'] == 'Disagree':
            return "False"
    else:
        if row['3_label_majority_answer'] == 'Agree':
            return "False"
        elif row['3_label_majority_answer'] == 'Disagree':
            return "True"
    return None

def load_twitter():
    DATASET_PATH = "Truth_Seeker_Model_Dataset.csv"
    df = pd.read_csv(DATASET_PATH)
    
    print('Number of training sentences: {:,}\n'.format(df.shape[0]))
    
    # Display 10 random rows from the data.
    df = df.sample(frac=1)

    # Drop the no majority
    df = df.drop('5_label_majority_answer', axis=1)
    #df = df.drop(df[df['3_label_majority_answer'] == 'NO MAJORITY'].index, axis=0)

    df2 = pd.DataFrame()
    df2['2-way-label'] = df.apply(lambda x: generate_truthfulness_2way(x), axis=1)

    df2['2-way-label'] = df2['2-way-label'].replace({'True': 0, 'False': 1})
    y = df2["2-way-label"].values
    X = df
    return (X,y)


In [13]:
#data_base_folder = 'aclImdb_v1/aclImdb'

In [14]:
X,y = load_twitter()
print(X)

#neg = load_data('{}/train/neg'.format(data_base_folder))
#pos = load_data('{}/train/pos'.format(data_base_folder))
#train_data = np.array(neg+pos)
#train_labels = np.array([[1,0]]*len(neg) + [[0,1]]*len(pos))
#train_data.shape, train_labels.shape

Number of training sentences: 134,198

        Unnamed: 0              author  \
85592        85592  Samantha Putterman   
121503      121503  Samantha Putterman   
38872        38872       Lauren Caruba   
97372        97372             Noah Y.   
132638      132638       Tom Kertscher   
...            ...                 ...   
29814        29814           Dan Clark   
38055        38055         Sean Nelson   
119427      119427      Ciara O'Rourke   
24766        24766       Tom Kertscher   
78929        78929         Andy Nguyen   

                                                statement  target  \
85592   The movie I am Legend was based in the year 20...   False   
121503  Elderly people were beat up by a BLM hate grou...   False   
38872   "More black babies are aborted in NYC than born."    True   
97372   Joe Biden is handing our power grid to the Chi...   False   
132638  No other country is having a second wave of CO...   False   
...                                       

  df2['2-way-label'] = df2['2-way-label'].replace({'True': 0, 'False': 1})


In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=.2, random_state=42)
X_train.shape,y_train.shape

((107358, 8), (107358,))

In [16]:
X_test.shape, y_test.shape

((26840, 8), (26840,))

### Torch Datasets

- takes in inputs and outputs/labels
- interfaces with tokenizer
- handles batching

In [17]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

### Bert Class

- first "layer" is a pre-trained BERT model
- you can add whatever layers you want after that

In [18]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()
                   
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

### Helpful Functions

Loss

- This task is binary, so it uses binary crossentropy loss
- Tasks with more labels will use categorical crossentropy
- Tasks that don't have labels, but rather have distributions should use KL divergence
- Tasks that don't have distributions should use something like RMSE loss

Train

- Steps through the data batch by batch
- grabs ids, masks, and token_type_ids which are required inputs for BERT
- inputs are passed through the model, compared to targets, computes loss function, backprops

Validation

- Takes a model, passes inputs
- Need to use the targets from here because they are potentially shuffled!

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

### The Tokenizer

- Converts a raw string to the ids, masks, and token_type_ids

In [None]:
# Define sentences

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# what does the tokenizer do?
print(X_train.iloc[5])

tokenizer.encode_plus(
            X_train.iloc[5],
            None,
            add_special_tokens=True,
            max_length=410,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )

Unnamed: 0                                                             47879
author                                                         Tom Kertscher
statement                       The Pfizer COVID-19 vaccine is not approved.
target                                                                 False
BinaryNumTarget                                                          0.0
manual_keywords                                 not approved,pfizer, vaccine
tweet                      @JAClementsTX @POTUS The Pfizer COVID-19 vacci...
3_label_majority_answer                                                Agree
Name: 47879, dtype: object




ValueError: Input Unnamed: 0                                                             47879
author                                                         Tom Kertscher
statement                       The Pfizer COVID-19 vaccine is not approved.
target                                                                 False
BinaryNumTarget                                                          0.0
manual_keywords                                 not approved,pfizer, vaccine
tweet                      @JAClementsTX @POTUS The Pfizer COVID-19 vacci...
3_label_majority_answer                                                Agree
Name: 47879, dtype: object is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

### Training setup

- hyperparameters
- setup dataset
- setup parameters
- setup dataloader

In [None]:
MAX_LEN = 410
BATCH_SIZE = 64
EPOCHS = 3
NUM_OUT = 2 # binary task
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(X_train, torch.from_numpy(y_train), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(X_test, torch.from_numpy(y_test), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train,  Evaluate

- model.to -> send to GPU, if available (anything computed should be put onto the GPU)
- setup optimizer - could use Stochastic Gradient Descent, but ADAM tends to work better
- for each epoch, train, show the loss, evaluate on the test data

In [None]:
model = BERTClass(NUM_OUT)
model.to(device)    

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = torch.max(targs, dim=1)
    print('accuracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))