<a href="https://colab.research.google.com/github/MiHarsh/CodaLab-SharedTask/blob/main/Roberta_for_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Firstly, lets install transformer package

In [1]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-b832e65e-0923-03d7-b156-c1ae5f781d9f)


In [2]:
!pip install transformers --quiet

[K     |████████████████████████████████| 1.3MB 8.4MB/s 
[K     |████████████████████████████████| 890kB 18.9MB/s 
[K     |████████████████████████████████| 2.9MB 61.0MB/s 
[K     |████████████████████████████████| 1.1MB 54.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


### Import required libraries





In [3]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import re
!pip install emoji --quiet
import emoji
!pip install contractions --quiet
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import unicodedata


from transformers import RobertaTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import plotly.express as px
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

[?25l[K     |██████▍                         | 10kB 26.7MB/s eta 0:00:01[K     |████████████▉                   | 20kB 4.6MB/s eta 0:00:01[K     |███████████████████▎            | 30kB 5.7MB/s eta 0:00:01[K     |█████████████████████████▊      | 40kB 4.8MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.7MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 317kB 10.6MB/s 
[K     |████████████████████████████████| 245kB 26.0MB/s 
[?25h  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Check out for Gpu

In [6]:
if torch.cuda.is_available():    
  device = torch.device("cuda")
  print('The GPU we use is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

The GPU we use is: Tesla P100-PCIE-16GB


#Data and preprocessing

In [7]:
train_datapath="https://raw.githubusercontent.com/MiHarsh/MiHarsh/master/Constraint_English_Train%20-%20Sheet1.csv"
val_datapath  ="https://raw.githubusercontent.com/MiHarsh/MiHarsh/master/Constraint_English_Val%20-%20Sheet1.csv"
train         = pd.read_csv(train_datapath)
valid         = pd.read_csv(val_datapath)
total         = pd.concat([train,valid],ignore_index=True)
mix           = total.iloc[:,1:]
le            = LabelEncoder()
mix['label']  = le.fit_transform(mix['label'])
mix

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,1
1,States reported 1121 deaths a small rise from ...,1
2,Politically Correct Woman (Almost) Uses Pandem...,0
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,Populous states can generate large case counts...,1
...,...,...
8555,Donald Trump wrongly claimed that New Zealand ...,0
8556,Current understanding is #COVID19 spreads most...,1
8557,Nothing screams “I am sat around doing fuck al...,0
8558,Birx says COVID-19 outbreak not under control ...,0


### Data Cleaning

In [8]:
def cleaning(text):
  text= text.lower()
  text= emoji.demojize(text)
  text=contractions.fix(text)
  text=text.strip()
  text=text.replace('[^\w\s]','')
  text=re.sub(r'http\S+', '', text)
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  BAD_SYMBOLS_RE = re.compile('[^0-9a-z +]')
  text = REPLACE_BY_SPACE_RE.sub(' ' , text)
  text = BAD_SYMBOLS_RE.sub(' ',text)
  
  return text

clean=mix['tweet'].apply(cleaning)
STOPWORDS = set(stopwords.words('english'))

ff=[]
for i in clean:
  text=unicodedata.normalize('NFKD', i).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  ff.append(text)
dd=pd.DataFrame(ff)
dataset = pd.concat([dd,mix['label']],axis=1)
dataset

Unnamed: 0,0,label
0,the cdc currently reports 99031 deaths in gen...,1
1,states reported 1121 deaths a small rise from ...,1
2,politically correct woman almost uses pandem...,0
3,indiafightscorona we have 1524 covid testin...,1
4,populous states can generate large case counts...,1
...,...,...
8555,donald trump wrongly claimed that new zealand ...,0
8556,current understanding is covid19 spreads most...,1
8557,nothing screams i am sat around doing fuck al...,0
8558,birx says covid 19 outbreak not under control ...,0


### Lets tokenize the sentences using RobertaTokenizer and map those tokens to unique ID's
 


In [9]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)          #for every pre-trained model, it has its own tokenizer 
train_input_tokens = []
valid_input_tokens = []

train_dataset      = dataset[:len(train)]
valid_dataset      = dataset[len(train):]

for sent in train_dataset[0]:                                                                   #Special Tokens are set True to identify the start and end of sentences.
    encoded_sent = tokenizer.encode(sent,add_special_tokens = True,add_prefix_space=True) #It is necessary in case of Roberta to add prefix space.
    train_input_tokens.append(encoded_sent)

for sent in valid_dataset[0]:                                                                   #Special Tokens are set True to identify the start and end of sentences.
    encoded_sent = tokenizer.encode(sent,add_special_tokens = True,add_prefix_space=True) #It is necessary in case of Roberta to add prefix space.
    valid_input_tokens.append(encoded_sent)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




Token indices sequence length is longer than the specified maximum sequence length for this model (1973 > 512). Running this sequence through the model will result in indexing errors


### Lets pad input tokens with 0 value and also truncate the length of sentences to 100 tokens


In [10]:
MAX_LEN = 40
train_input_ids = pad_sequences(train_input_tokens, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
valid_input_ids = pad_sequences(valid_input_tokens, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

### Attention masks are created to distinguish the actual token Id and padded Id.



In [11]:
train_attention_masks = []

for sent in train_input_ids:
    mask = [int(token_id > 0) for token_id in sent] #If a token ID is 0, then it's padding, set the mask to 0 else 1
    train_attention_masks.append(mask)

valid_attention_masks = []

for sent in valid_input_ids:
    mask = [int(token_id > 0) for token_id in sent] #If a token ID is 0, then it's padding, set the mask to 0 else 1
    valid_attention_masks.append(mask)

### Convert to tensors

In [12]:
train_inputs = torch.tensor(train_input_ids)
validation_inputs = torch.tensor(valid_input_ids)
train_labels = torch.tensor(train_dataset.label.values)
validation_labels = torch.tensor(valid_dataset.label.values)
train_masks = torch.tensor(train_attention_masks)
validation_masks = torch.tensor(valid_attention_masks)

### Loading RobertaForSequenceClassification

In [13]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base",num_labels = 2,output_attentions = False,output_hidden_states = False)
                                                            #Number of Labels is set to 2 ( Informative and Uninformative )

model.cuda();

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [14]:
#Batch Size is set to 32 
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size,)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data,  batch_size=batch_size)

### Optimization using AdamW

In [15]:
optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)
epochs    = 8
total_steps = len(train_dataloader) * epochs                                                                #Total number of training steps is number of batches * number of epochs.
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)

In [16]:
#for noting the time of execution
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

### For reproducibility

In [17]:
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [18]:
def train_one_epoch(model,train_dataloader,optimizer,scheduler):
    model.train()
    torch.set_grad_enabled(True)
    progress_bar = tqdm(train_dataloader)
    train_loss=[]
    train_labels=[]
    pred_labels=[]
    for batch in progress_bar:
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_labels = batch[2].cuda()

        optimizer.zero_grad()
        outputs = model(b_input_ids, 
                    token_type_ids=None,                                          #Token_type_Ids are not considered in case of Distilbert
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        loss = outputs[0]
        loss.backward()                                                             # Perform a backward pass to calculate the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)                     # Update parameters
        optimizer.step()
        scheduler.step()
        train_loss.append(loss.item())
        logits = outputs[1]                                                       # The "logits" are the output to the model ; values prior to applying an activation function like the softmax.
        logits = logits.detach().cpu().numpy()
        pred_labels.extend(np.argmax(logits, axis=1).reshape(-1,))
        train_labels.extend(b_labels.to('cpu').numpy().reshape(-1,))

    true_labels = np.array(train_labels)
    pred_labels = np.array(pred_labels) 
    train_f1    = f1_score(true_labels,pred_labels)
    train_acc   = np.mean(true_labels == pred_labels)                   
    return np.mean(train_loss),train_acc,train_f1


In [19]:
def valid_one_epoch(model,valid_dataloader):
    model.eval()
    torch.set_grad_enabled(False)
    progress_bar = tqdm(valid_dataloader)
    valid_loss=[]
    val_labels=[]
    pred_labels=[]
    for batch in progress_bar:
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_labels = batch[2].cuda()

        outputs = model(b_input_ids, 
                    token_type_ids=None,                                          #Token_type_Ids are not considered in case of Distilbert
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        loss = outputs[0]
        valid_loss.append(loss.item())
        logits = outputs[1]                                                       # The "logits" are the output to the model ; values prior to applying an activation function like the softmax.
        logits = logits.detach().cpu().numpy()
        pred_labels.extend(np.argmax(logits, axis=1).reshape(-1,))
        val_labels.extend(b_labels.to('cpu').numpy().reshape(-1,))
    true_labels = np.array(val_labels)
    pred_labels = np.array(pred_labels)

    val_f1      = f1_score(true_labels,pred_labels)
    val_acc     = np.mean(true_labels == pred_labels)
    return np.mean(valid_loss),val_acc,val_f1

### Training

In [20]:
train_loss_values = []
valid_loss_values = []                                                                              
best_val_loss     = np.Inf

for epoch_i in range(0, epochs):                                                  #for every epoch
    
    t0 = time.time()
    train_loss,train_acc,train_f1 = train_one_epoch(model,train_dataloader,optimizer,scheduler)
    valid_loss,val_acc,val_f1     = valid_one_epoch(model,validation_dataloader)
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print("  Average training loss   : {0:.6f}".format(train_loss))
    print("  Average validation loss : {0:.6f}".format(valid_loss))
    print("  Training f1_score       : {0:.6f}".format(train_f1))
    print("  Validation f1_score     : {0:.6f}".format(val_f1))
    print("  Training acc_score      : {0:.6f}".format(train_acc))
    print("  Validation acc_score    : {0:.6f}".format(val_acc))
    print("  Epoch took: {:}".format(format_time(time.time() - t0)))

    if best_val_loss > valid_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(),"CodaLab_best_loss.pth")
    print()

    train_loss_values.append(train_loss)
    valid_loss_values.append(valid_loss)
print("Training complete!")

HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.384801
  Average validation loss : 0.223075
  Training f1_score       : 0.847183
  Validation f1_score     : 0.922667
  Training acc_score      : 0.832710
  Validation acc_score    : 0.918692
  Epoch took: 0:00:36



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.224865
  Average validation loss : 0.248310
  Training f1_score       : 0.926114
  Validation f1_score     : 0.925973
  Training acc_score      : 0.923053
  Validation acc_score    : 0.919159
  Epoch took: 0:00:36



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.148361
  Average validation loss : 0.144472
  Training f1_score       : 0.951896
  Validation f1_score     : 0.951351
  Training acc_score      : 0.950000
  Validation acc_score    : 0.949533
  Epoch took: 0:00:36



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.098593
  Average validation loss : 0.177115
  Training f1_score       : 0.973457
  Validation f1_score     : 0.955171
  Training acc_score      : 0.972274
  Validation acc_score    : 0.952804
  Epoch took: 0:00:36



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.080457
  Average validation loss : 0.185957
  Training f1_score       : 0.979378
  Validation f1_score     : 0.959175
  Training acc_score      : 0.978505
  Validation acc_score    : 0.957477
  Epoch took: 0:00:35



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.058361
  Average validation loss : 0.240753
  Training f1_score       : 0.986289
  Validation f1_score     : 0.951676
  Training acc_score      : 0.985670
  Validation acc_score    : 0.948131
  Epoch took: 0:00:35



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.036717
  Average validation loss : 0.292067
  Training f1_score       : 0.990916
  Validation f1_score     : 0.950452
  Training acc_score      : 0.990498
  Validation acc_score    : 0.946262
  Epoch took: 0:00:35



HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


  Average training loss   : 0.039830
  Average validation loss : 0.254324
  Training f1_score       : 0.991519
  Validation f1_score     : 0.952296
  Training acc_score      : 0.991121
  Validation acc_score    : 0.950000
  Epoch took: 0:00:35

Training complete!


### Visualization of training loss of model

In [21]:
f = pd.DataFrame(train_loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Training loss of the Model',xaxis_title='Epoch',yaxis_title='Loss')
fig.show()

In [22]:
f = pd.DataFrame(valid_loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Validation loss of the Model',xaxis_title='Epoch',yaxis_title='Loss')
fig.show()