<a href="https://colab.research.google.com/github/Karampruthi/CE888-Data-Science-Decision-making/blob/main/bert_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob
import requests
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('stopwords')
nltk.download('words')
stop_words = stopwords.words('english')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


# Loading the Data

In [2]:
text = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/train_text.txt').text
label = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/train_labels.txt').text
val_text = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/val_text.txt').text
val_label = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/val_labels.txt').text
text_test = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/test_text.txt').text
label_test = requests.get('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/test_labels.txt').text

# Text PreProcessing

In [3]:
def process(label,text):
    
    tag = []
    for sent in label.split("\n"):
        try:
            tag.append(int(sent))
        except ValueError:
            pass

    tweet = []
    for text in text.split('\n'):
        try:
            tweet.append(text)
        except ValueError:
            pass
        
    data = {'tweet':tweet[:-1],'tag':tag}
    df = pd.DataFrame(data)
    df['sentiment'] = df.tag.apply(lambda x:'Positive' if x==2 else 'Negative' if x==0 else 'Neutral')
    return df

In [4]:
df = process(label,text)
df_val = process(val_label,val_text)
df_test = process(label_test,text_test)

In [5]:
df

Unnamed: 0,tweet,tag,sentiment
0,"""QT @user In the original draft of the 7th boo...",2,Positive
1,"""Ben Smith / Smith (concussion) remains out of...",1,Neutral
2,Sorry bout the stream last night I crashed out...,1,Neutral
3,Chase Headley's RBI double in the 8th inning o...,1,Neutral
4,@user Alciato: Bee will invest 150 million in ...,2,Positive
...,...,...,...
45610,"@user \""""So amazing to have the beautiful Lady...",2,Positive
45611,"9 September has arrived, which means Apple's n...",2,Positive
45612,Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...,2,Positive
45613,@user no I'm in hilton head till the 8th lol g...,1,Neutral


## Text Cleaning

In [6]:
def cleaner(tweet):
    
    tweet = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", tweet)
    tweet = tweet.lower()   
    tweet = re.sub(r'\d+','', tweet) 
    # tweet = tweet.split()
    # tweet = " ".join([word for word in tweet if not word in stop_words])
    # tweet = " ".join([stemmer.stem(word) for word in tweet])

    # tweet = tweet.split()
    # tweet = " ".join(w for w in tweet if w in words or not w.isalpha())
    tweet = tweet.replace("user", "")

    return  tweet

In [7]:
def cleanup(df):
   
    train_cleaned = df['tweet'].apply(cleaner)
    df['tweet'] = train_cleaned.apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
  
    return df    

In [8]:
train_cleaned = cleanup(df)
val_cleaned = cleanup(df_val)
test_cleaned = cleanup(df_test)

lst = [train_cleaned, val_cleaned]
train_cleaned = pd.DataFrame(np.concatenate(lst),columns=val_cleaned.columns)

In [9]:
train_cleaned.tweet[0]

'qt in the original draft of the th book remus lupin survived the battle of hogwarts happybirthdayremuslupin'

In [10]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla V100-SXM2-16GB


In [11]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 13.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 19.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.5MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [12]:
from transformers import BertTokenizer, AutoModel, AutoTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Create a function to tokenize a set of texts
def preprocessing(data,length):
  
    input_ids = []
    attention_masks = []

    for sentence in data:
   
        encoded_sent = tokenizer.encode_plus(
            text=sentence,  
            add_special_tokens=True,        
            max_length=length,              
            pad_to_max_length=True,                  
            return_attention_mask=True      
            )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [13]:
tweets = np.concatenate([train_cleaned.tweet.values, test_cleaned.tweet.values])
tweets_encoded = [tokenizer.encode(sent, add_special_tokens=True, truncation =True) for sent in tweets]
max_length = max([len(sent) for sent in tweets_encoded])
print('Max length: ', max_length)

Max length:  51


In [14]:
data = [train_cleaned.tweet[0]]
token_ids = list(preprocessing(data,max_length)[0].squeeze().numpy())
print('Original: ', train_cleaned.tweet[0])
print('Token IDs: ', token_ids)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  qt in the original draft of the th book remus lupin survived the battle of hogwarts happybirthdayremuslupin
Token IDs:  [101, 1053, 2102, 1999, 1996, 2434, 4433, 1997, 1996, 16215, 2338, 2128, 7606, 11320, 8091, 5175, 1996, 2645, 1997, 27589, 18367, 2015, 3407, 17706, 2705, 10259, 28578, 2271, 7630, 8091, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
%%time
train_inputs, train_masks = preprocessing(train_cleaned.tweet.values,max_length)
val_inputs, val_masks = preprocessing(test_cleaned.tweet.values,max_length)

CPU times: user 31.3 s, sys: 139 ms, total: 31.4 s
Wall time: 31.4 s


In [16]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_labels = torch.tensor(train_cleaned['tag'])
val_labels = torch.tensor(test_cleaned['tag'])

batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [17]:
%%time

import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, freeze_bert=False):

        super(BertClassifier, self).__init__()
        inlet, outlet, b_out  = 768, 50, 3

        self.bert =  BertModel.from_pretrained('bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(inlet, outlet),
            nn.ReLU(),
            nn.Linear(outlet, b_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = True
        
    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)
    
        return logits

CPU times: user 167 µs, sys: 0 ns, total: 167 µs
Wall time: 171 µs


In [18]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):

    bert_classifier = BertClassifier()

    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    
                      eps=1e-8   
                      )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, 
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [24]:
import random
import time
from sklearn.metrics import recall_score

loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):

    torch.cuda.empty_cache()
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        # accuracy = (preds == b_labels).cpu().numpy().mean() * 100

        accuracy = recall_score(b_labels.tolist(),preds.tolist(),average = 'macro')
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [25]:
def print_model_params(model):
  params = list(model.named_parameters())
  print('The BERT model has {:} different named parameters.\n'.format(len(params)))
  print('==== Embedding Layer ====\n')
  for p in params[0:5]:
      print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
  print('\n==== First Transformer ====\n')
  for p in params[5::]:
      print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
  print('\n==== Output Layer ====\n')
  for p in params[-4:]:
      print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [26]:
# print_model_params(bert_classifier)

In [28]:
set_seed(45)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
train(bert_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.978573   |     -      |     -     |   3.87   
   1    |   40    |   0.829482   |     -      |     -     |   3.66   
   1    |   60    |   0.765776   |     -      |     -     |   3.65   
   1    |   80    |   0.732347   |     -      |     -     |   3.64   
   1    |   100   |   0.696742   |     -      |     -     |   3.66   
   1    |   120   |   0.684512   |     -      |     -     |   3.66   
   1    |   140   |   0.719244   |     -      |     -     |   3.67   
   1    |   160   |   0.713639   |     -      |     -     |   3.65   
   1    |   180   |   0.685825   |     -      |     -     |   3.65   
   1    |   200   |   0.696769   |     -      |     -     |   3.65   
   1    |   220   |   0.675370   |     -      |     -     |   3.65   
   1    |   240   |   0.716398   |     -      |     -     |   3.65   
