### Part 1

# Imports

In [None]:
import re
from tqdm import tqdm
import seaborn as sns
from textblob import TextBlob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import nltk
from nltk.corpus import stopwords

# Handling the dataset

In [None]:
# Load the raw dataset from 'dataset.csv' (path is relative to the working directory).
data=pd.read_csv('dataset.csv')

In [None]:
# Keep only the first 12290 rows (all columns) for the rest of the notebook.
data=data.iloc[0:12290,:]

In [None]:
# Display the column labels of the frame.
data.columns

In [None]:
# Display (rows, columns) after truncation.
data.shape

In [None]:
# Remove the 'Unnamed: 0' column in place.
# NOTE(review): presumably a leftover index column written by an earlier to_csv — confirm.
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
data.isna().any()

# Building functions 

In [None]:
def getSubjectivity(tweet):
    """Return the TextBlob subjectivity score of ``tweet``."""
    blob = TextBlob(tweet)
    return blob.sentiment.subjectivity

def getPolarity(tweet):
    """Return the TextBlob polarity score of ``tweet``."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

def getSentimentTextBlob(polarity):
    """Map a polarity score to a sentiment label.

    Exactly zero is "Neutral", anything below zero is "Negative", and every
    other value is "Positive".
    """
    if polarity == 0:
        return "Neutral"
    return "Negative" if polarity < 0 else "Positive"

In [None]:
# One-time setup: uncomment the next line to download the NLTK stop-word corpus.
#nltk.download('stopwords')

In [None]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_processing(s, stop_words=None):
    """Normalise a tweet for sentiment scoring.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, stripped of digit runs, and finally has
    its stop words removed. Remaining words are joined by single spaces.

    Args:
        s: Raw tweet text.
        stop_words: Optional collection of words to drop. Defaults to NLTK's
            English stop-word list.

    Returns:
        The cleaned text as a single string.
    """
    if stop_words is None:
        # Reading the corpus and scanning it as a list for every single word
        # is needlessly slow; load it once and test membership against a set.
        stop_words = _english_stopwords()

    s = s.lower()
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub('[0-9]+', '', s)
    return " ".join(word for word in s.split() if word not in stop_words)



In [None]:
# Store a cleaned version of every 'tweetcaption' entry in a new 'tweet_processed' column.
data['tweet_processed']=data['tweetcaption'].apply(text_processing)

In [None]:
# Longest cleaned tweet, measured with len() on each entry (characters, not tokens).
max_len = max(map(len, data.tweet_processed))
print('Max length: ', max_len)

In [None]:
# Score every cleaned tweet with TextBlob, then bucket the polarity score
# into a "Negative" / "Neutral" / "Positive" label.
data['Subjectivity']=data['tweet_processed'].apply(getSubjectivity)
data['Polarity']=data['tweet_processed'].apply(getPolarity)
data['Sentiment']=data['Polarity'].apply(getSentimentTextBlob)

In [None]:
# Class balance: number of tweets per sentiment label.
data['Sentiment'].value_counts()

In [None]:
def sentimentscores(sentiment):
    """Encode a sentiment label as an integer class id.

    "Positive" maps to 2, "Negative" maps to 1, and anything else to 0.
    """
    if sentiment == "Negative":
        return 1
    if sentiment == "Positive":
        return 2
    return 0

In [None]:
# Integer class id per tweet (2 = Positive, 1 = Negative, 0 = anything else); used as the training label.
data['Sentiment_score']=data['Sentiment'].apply(sentimentscores)

In [None]:
# Preview the frame with the derived sentiment columns.
data.head()

In [None]:
# Sanity check: list the distinct class ids present in the labels.
data['Sentiment_score'].unique()

In [None]:
# Stratified 85/15 train/validation split.
# Note that the "X" arrays hold row index labels of `data`, not features; they
# are used afterwards to tag rows. Stratifying on the label keeps the class
# proportions the same in both subsets, and random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(data.index.values, 
                                                   data.Sentiment_score.values,
                                                   test_size = 0.15,
                                                   random_state = 1,
                                                   stratify = data.Sentiment_score.values)

In [None]:
# Record each row's subset in a new 'data_type' column, looked up by index label.
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

In [None]:
# Install the transformers package (provides the BERT tokenizer and model imported in later cells).
pip install transformers

Tokenization

In [None]:
from transformers import BertTokenizer

# Load the tokenizer that matches the 'bert-base-uncased' checkpoint; input is lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                         do_lower_case = True)

In [None]:
# Tokenize both subsets to fixed-length tensors: every sequence is truncated
# or padded to exactly 256 token ids, with a matching attention mask.
#
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
# (same behaviour when max_length is given), and the texts are passed as plain
# Python lists, which is the input type the tokenizer API documents.

#encode train set
encoded_data_train = tokenizer.batch_encode_plus(data[data.data_type == 'train'].tweet_processed.tolist(),
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                padding = 'max_length',
                                                truncation = True,
                                                max_length = 256,
                                                return_tensors = 'pt')

#encode validation set
encoded_data_val = tokenizer.batch_encode_plus(data[data.data_type == 'val'].tweet_processed.tolist(),
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                padding = 'max_length',
                                                truncation = True,
                                                max_length = 256,
                                                return_tensors = 'pt')

In [None]:
# Pull the token ids and attention masks out of the encodings and build the
# label tensors from the same row selection, so row order matches.

#train set
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type == 'train'].Sentiment_score.values)

#validation set
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type == 'val'].Sentiment_score.values)

Setting up the pre-trained BERT model

In [None]:
from transformers import BertForSequenceClassification

# Pre-trained 'bert-base-uncased' with a sequence-classification head.
# num_labels = 3 matches the three Sentiment_score class ids (0, 1, 2).
# Attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = 3,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Create dataloaders

In [None]:

from torch.utils.data import TensorDataset

# Each dataset item is an (input_ids, attention_mask, label) triple; the
# training loop and evaluate() rely on this order when indexing a batch.

#train set
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

#validation set
dataset_val = TensorDataset(input_ids_val, 
                             attention_masks_val, 
                             labels_val)

In [None]:

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Training batches are drawn in a fresh random order every epoch.
dataloader_train = DataLoader(dataset_train,
                              sampler = RandomSampler(dataset_train),
                              batch_size = 32)

# Validation only measures the model, so shuffling adds nothing: iterate in a
# fixed order to keep evaluation deterministic and to avoid consuming random
# numbers between training epochs.
dataloader_val = DataLoader(dataset_val,
                              sampler = SequentialSampler(dataset_val),
                              batch_size = 32)

In [None]:

# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# import the optimizer from torch and only the scheduler from transformers.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# weight_decay = 0.0 keeps the default of the transformers optimizer this
# replaces (torch's own default is 0.01).
optimizer = AdamW(model.parameters(),
                 lr = 1e-5,
                 eps = 1e-8,
                 weight_decay = 0.0)

epochs = 10

# Linear decay of the learning rate to zero over all optimisation steps
# (batches per epoch * epochs), with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = len(dataloader_train)*epochs)

Evaluation

In [None]:
def evaluate(dataloader_val):
    """Run the global ``model`` over a dataloader without updating its weights.

    Args:
        dataloader_val: Iterable yielding ``(input_ids, attention_mask,
            labels)`` tensor batches.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)`` where
        ``loss_val_avg`` is the mean of the per-batch losses, ``predictions``
        is a NumPy array of the logits of every batch concatenated along
        axis 0, and ``true_vals`` is the NumPy array of the matching labels.
    """
    # Evaluation mode; the model is left in this mode on return.
    model.eval()

    # Send batches to wherever the model's weights actually live. Picking the
    # device from CUDA availability alone fails with a device mismatch when
    # the model has not been moved to the GPU.
    device = next(model.parameters()).device

    # Tracking variables
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**inputs)

        # With labels supplied, the first output is the loss and the second the logits.
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        # Collect results on the CPU as NumPy arrays.
        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(inputs['labels'].cpu().numpy())

    # Average loss per batch
    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the predicted classes.

    Args:
        preds: 2-D array of per-class scores (one row per sample); the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids.

    Returns:
        The F1 score averaged over classes, weighted by class support.
    """
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    # Score the predicted class ids, not the raw per-class scores.
    return f1_score(labels_flat, preds_flat, average = 'weighted')

Training the model

In [None]:
import random

# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random
# number generators with the same value for repeatable training runs.
# NOTE(review): this cell runs after the model and dataloaders were created,
# so any randomness used while building them is not covered — confirm whether
# the seeding should move earlier.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
import os

# Fine-tune the model for `epochs` epochs. After every epoch the weights are
# checkpointed and the model is scored on the validation set.

# Choose the device once and move the model onto it, so the model and the
# batches it receives are always on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.save does not create missing directories.
os.makedirs('Models', exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,
                        disable = False)

    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is: len(batch) is the number of tensors in
        # the batch tuple (3), not a sample count, so dividing by it only
        # distorts the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # NOTE(review): the file name starts with a space ("Models/ BERT_...").
    # Kept unchanged in case other code loads checkpoints by this exact path.
    torch.save(model.state_dict(), f'Models/ BERT_ft_epoch{epoch}.model')

    tqdm.write(f'\n Epoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

In [None]:
# Drop the intermediate columns; the 'Sentiment' and 'Sentiment_score' columns are kept.
data=data.drop(['Subjectivity','Polarity','data_type'],axis=1)

In [None]:
# Preview the frame after dropping the helper columns.
data.head()