In [None]:
# Colab-specific helper used to attach Google Drive to this runtime
from google.colab import drive

# Mount Google Drive at /content/gdrive; the dataset CSV is read from a path under this mount later on
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Install hugging face transformers api
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 35.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 71.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


# Import all necessary libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, random_split
from torch.optim import AdamW, RAdam
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from transformers import XLMRobertaModel, XLMRobertaTokenizer, AutoTokenizer, XLMRobertaForSequenceClassification, get_linear_schedule_with_warmup
from tensorflow import keras
import tensorflow as tf
from gensim.parsing.preprocessing import remove_stopwords
import warnings

# Absolute path anchored at the Drive mount point (/content/gdrive), so loading the
# dataset no longer depends on the notebook's current working directory.
DATA_PATH = '/content/gdrive/My Drive/Data/IMDB-dataset/IMDB Dataset.csv'
df = pd.read_csv(DATA_PATH)

# Shrink the data size due to memory issues

In [None]:
# Keep only the first 15,000 rows (memory constraint). The explicit .copy() makes `df`
# an independent frame rather than a slice of the loaded data, so the column assignment
# done during label encoding does not trigger pandas' SettingWithCopyWarning.
df = df[:15000].copy()

# We encode the labels to 0s and 1s

In [None]:
# Replace the string sentiment labels with integer class ids (two classes -> 0 and 1)
lab_encoder = LabelEncoder()
df['sentiment'] = lab_encoder.fit_transform(df['sentiment'])

# Pull the targets and the raw review texts out of the frame as numpy arrays
labels = df['sentiment'].values
reviews = df['review'].values

### Next, we need to tokenize each review using XLM-RoBERTa's tokenizer, as we're using XLM-RoBERTa for our zero-shot cross-lingual sentiment classification.
### This tokenizer's encoding includes the following steps:
  #### 1. Tokenize the whole sentence
  #### 2. Add the special tokens `<s>` and `</s>` (XLM-RoBERTa's equivalents of BERT's [CLS] and [SEP])
  #### 3. Create a mapping between each token and its ID (accessible from the dictionary by the 'input_ids' key)
  #### 4. Pad each sentence to the max length of sentences
  #### 5. Create an attention mask that marks the `<pad>` tokens, as they should be ignored during the attention process

In [None]:
# Checkpoint name and the classes used to load it.
# AutoTokenizer is used because XLMRobertaTokenizer raised an error for the author.
weights = 'xlm-roberta-base'
model_class = XLMRobertaModel
tokenizer_class = AutoTokenizer

# Instantiate the pretrained encoder and its matching tokenizer from the checkpoint.
# NOTE(review): do_lower_case is presumably ignored by the XLM-R tokenizer - confirm before relying on it
model = model_class.from_pretrained(weights)
tokenizer = tokenizer_class.from_pretrained(weights, do_lower_case=True)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

# We do a little preprocessing
### Other preprocessing attempts, such as lemmatizing and removing stop words, resulted in poorer performance for all models, so I only removed HTML tags, special characters and digits

In [None]:
import nltk
# Download the WordNet corpora for nltk.
# NOTE(review): nothing in the visible cells uses WordNet - presumably left over from the
# lemmatization experiments mentioned above; confirm before removing.
nltk.download('wordnet')
nltk.download('omw-1.4')
# re is used for the character filtering, BeautifulSoup for stripping HTML markup
import re
from bs4 import BeautifulSoup

# removing the html strips
def strip_html(text):
    """Return the text content of `text` with all HTML markup removed.

    Args:
        text: a string that may contain HTML tags (e.g. ``<br />``).

    Returns:
        The text nodes of the parsed document joined with single spaces.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Join text nodes with a space: without a separator, words on either side of a tag
    # are glued together ("great<br />movie" -> "greatmovie").
    return soup.get_text(separator=" ")

# strip html markup, then keep ASCII letters only
def process_text(text):
    """Clean one review: drop HTML tags, then blank out everything that is not A-Z / a-z.

    Args:
        text: the raw review string.

    Returns:
        The cleaned string, in which every run of characters outside ``A-Za-z``
        (digits, punctuation, whitespace, non-ASCII letters) is replaced by one space.
    """
    without_markup = strip_html(text)
    return re.sub(r'[^A-Za-z]+', ' ', without_markup)

# clean every review and gather the results back into a numpy array
reviews = np.array([process_text(raw_review) for raw_review in reviews])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# define a function to tokenize a piece of text
def get_ids_and_masks(text, tokenizer, max_length=256):
  """Tokenize one piece of text and return its input ids and attention mask.

  Args:
    text: the string to tokenize.
    tokenizer: a callable Hugging Face tokenizer (called as ``tokenizer(text, ...)``).
    max_length: length every sequence is padded / truncated to (default 256,
      the value that was previously hard-coded).

  Returns:
    A tuple ``(input_ids, attention_mask)`` taken from the tokenizer output,
    requested as PyTorch tensors (``return_tensors='pt'``).
  """
  # calling the tokenizer directly is the current API; encode_plus is deprecated in its favour
  tokenized_text = tokenizer(text,
                             add_special_tokens=True,
                             padding='max_length',
                             max_length=max_length,
                             truncation=True,
                             return_tensors='pt',
                             return_attention_mask=True)

  # return input ids and attention masks for a piece of text
  return (tokenized_text['input_ids'], tokenized_text['attention_mask'])

In [None]:
# per-review tensors for the whole dataset: one entry of ids and one of masks per review
input_ids, attention_mask = [], []

# run each cleaned review through the tokenizer helper and store both results
for review in reviews:
  encoded = get_ids_and_masks(review, tokenizer)
  input_ids.append(encoded[0])
  attention_mask.append(encoded[1])

# Create tensors for input_ids, attention_masks and labels for the whole dataset

In [None]:
# Concatenate the per-review tensors along the first axis into one tensor each.
# NOTE: `input_ids` and `labels` are rebound here, so this cell is meant to run once
# after the tokenization cell.
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_mask)
# targets as a tensor so they can live in the same TensorDataset
labels = torch.tensor(labels)

## Use DataLoader to load data and iterate through it 

In [None]:
# number of samples per batch
batch_size = 16

# bundle ids, masks and labels so that indexing yields one aligned (ids, mask, label) triple
all_data = TensorDataset(input_ids, attention_masks, labels)

# batches are drawn in random order; each label stays paired with its review
dataloader = DataLoader(all_data,
                        batch_size=batch_size,
                        sampler=RandomSampler(all_data))

# Check for GPU

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the encoder to the selected device. Using .to(device) instead of .cuda() keeps
# the notebook runnable on CPU-only runtimes, where .cuda() raises an error.
model.to(device)

## Iterate through our data and extract features for our samples

In [None]:
# a function to extract features of samples in a given dataloader
def extract_features(dataloader, model):
  """Run every batch through `model` and collect one feature vector per sample.

  Args:
    dataloader: iterable of batches; each batch is indexed as
      ``batch[0]`` = input ids, ``batch[1]`` = attention mask, ``batch[2]`` = labels.
    model: a torch module called as
      ``model(input_ids, token_type_ids=None, attention_mask=mask)`` whose first
      output is indexed as ``[:, 0, :]`` (the vector at position 0 of each sequence).

  Returns:
    A tuple ``(features, labels)`` of two Python lists: one numpy vector per sample
    and the matching label per sample, in iteration order.
  """
  # run on whatever device the model's weights live on instead of relying on a global
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  features, labels = list(), list()

  # evaluation mode disables dropout so the features are deterministic;
  # the previous mode is restored afterwards so the caller's model is left unchanged
  was_training = model.training
  model.eval()
  try:
    # no gradients are needed for pure feature extraction
    with torch.no_grad():
      for batch in dataloader:
        b_input_ids = batch[0].to(model_device)
        b_input_mask = batch[1].to(model_device)

        batch_outputs = model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask)

        # keep only the vector at sequence position 0 of the first output for each sample
        features.extend(batch_outputs[0][:, 0, :].cpu().numpy())
        # labels never go through the model, so they are not moved to its device
        labels.extend(batch[2].cpu().numpy())
  finally:
    model.train(was_training)

  return (features, labels)

In [None]:
# run the whole dataset through the encoder: one feature vector and one label per review
features, labels = extract_features(dataloader, model)

# hold out 20% of the samples for testing; the fixed seed makes the split repeatable
(train_features, test_features,
 train_labels, test_labels) = train_test_split(features, labels,
                                               test_size=0.2,
                                               random_state=42)

In [None]:
# the splits are Python lists; stack them into numpy arrays for the keras model
train_X, train_y = np.array(train_features), np.array(train_labels)
test_X, test_y = np.array(test_features), np.array(test_labels)

# We will use different approaches for our classification purpose:
  ### 1. a deep keras model
  ### 2. logistic regression using sklearn
  ### 3. using XLMRobertaForSequenceClassification directly

# First approach, using a deep keras model

### It was really difficult to find appropriate hyperparameters for this deep model

In [None]:
# A deep fully-connected classifier on top of the 768-dimensional features.
# The author tried many hyperparameters and depths, deliberately encouraging a little
# overfitting; this configuration scored best on the training set, while test and
# validation scores still stayed close to each other.
hidden_widths = [1024, 512, 256, 128, 64]

layers = []
for position, width in enumerate(hidden_widths):
    # only the very first layer declares the input shape
    dense_kwargs = {'input_shape': (768,)} if position == 0 else {}
    # every hidden stage is Dense -> BatchNorm -> tanh -> Dropout(0.2)
    layers += [
        keras.layers.Dense(width, **dense_kwargs),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('tanh'),
        keras.layers.Dropout(0.2),
    ]
# single sigmoid unit for the binary decision
layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))

deep_model = keras.Sequential(layers)

# stop once the validation loss has not improved for 15 epochs and roll back to the best weights
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=15,
    verbose=1,
    restore_best_weights=True,
)
# multiply the learning rate by 0.2 when the validation loss plateaus for 5 epochs
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    verbose=1,
    min_delta=0.0001,
)

# callbacks handed to fit()
callbacks = [earlystop, reduce_lr]

# optimizer, loss and reported metric
deep_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.00002),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# train, using 10% of the training data for validation
deep_model.fit(
    train_X,
    train_y,
    epochs=100,
    batch_size=16,
    callbacks=callbacks,
    validation_split=0.1,
)

# evaluate the model's performance on the held-out test set
test_loss, test_acc = deep_model.evaluate(test_X, test_y)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: ReduceLROnPlateau reducing learning rate to 3.999999898951501e-06.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 26: ReduceLROnPlateau reducing learning rate to 7.999999979801942e-07.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 31: ReduceLROnPlateau reducing learning rate to 1.600000018697756e-07.
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

Epoch 36: ReduceLROnPlateau reducing learning rate to 3.199999980552093e-08.
Epoch 36: early stopping
Test accuracy: 0.8669999837875366
Test loss: 0.3291410803794861


### We reached an accuracy of **0.867** on the test set

---


# In the second approach, we will use a logistic regression classifier
### We find the best parameters for this model using grid search

In [None]:
# Candidate hyperparameters, restricted to valid solver/penalty combinations:
# 'newton-cg' and 'lbfgs' only support the l2 penalty, while 'liblinear' supports both
# l1 and l2. The previous single grid also generated l1 + newton-cg / lbfgs, whose fits
# always fail and were only hidden by the blanket warning filter.
regularization_strengths = np.logspace(-3, 3, 7)
parameters = [
    {
        'penalty': ['l2'],
        'C': regularization_strengths,
        'solver': ['newton-cg', 'lbfgs'],
    },
    {
        'penalty': ['l1', 'l2'],
        'C': regularization_strengths,
        'solver': ['liblinear'],
    },
]

# define a grid search object (10-fold cross-validation, scored by accuracy)
clf = GridSearchCV(LogisticRegression(),
                   param_grid=parameters,
                   scoring='accuracy',
                   cv=10)

# Grid search usually raises warnings (e.g. about convergence); silence them only while
# the search runs instead of disabling warnings for the rest of the notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # train classifiers
    clf.fit(train_features, train_labels)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy:",clf.best_score_)

## Now, we create the best model using the hyperparameters that we found

In [None]:
# Final logistic-regression classifier.
# NOTE(review): these values are hard-coded, presumably copied from the grid search
# result - confirm they still match clf.best_params_ after re-running the search.
best_params = {'penalty': 'l2', 'C': 10.0, 'solver': 'newton-cg'}
best_clf = LogisticRegression(**best_params)

# fit on the training features, then report accuracy on the held-out test features
best_clf.fit(train_features, train_labels)
best_clf.score(test_features, test_labels)

0.86

### **0.86** accuracy on the test set
---
## In our final approach, we use XLMRobertaForSequenceClassification, which is basically an XLMRobertaModel with a classifier layer on top

In [None]:
# Same pretrained checkpoint as the plain encoder, loaded with a classification head.
# num_labels=2 because this is binary classification; hidden states and attentions are
# not requested because the model performs the classification itself.
model_3 = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base',
                                                              num_labels=2,
                                                              output_hidden_states=False,
                                                              output_attentions=False)
# Move the model to the device selected earlier. .to(device) (rather than .cuda())
# keeps this cell runnable on CPU-only runtimes and consistent with how the batches
# are moved with .to(device).
model_3.to(device)

## Unlike the previous models, we split the data into train, validation and test sets before feeding it to XLM-RoBERTa

In [None]:
# we partition our data in 3 sets: train (70%), validation (10%) and test (the rest, ~20%)
n_samples = len(all_data)
train_size = int(n_samples * 0.7)
val_size = int(n_samples * 0.1)
# The test set takes whatever remains: truncating three independent products can leave
# the sizes short of n_samples, in which case random_split raises a ValueError.
test_size = n_samples - train_size - val_size

# a seeded generator makes the partition repeatable across runs
train, val, test = random_split(all_data,
                                [train_size, val_size, test_size],
                                generator=torch.Generator().manual_seed(42))

## We define the number of epochs and a learning rate
## As in BERT's paper, the recommended values for learning rate, batch size and number of epochs are:

*   ### number of epochs: from 2 to 4
*   ### learning rate: 2e-5, 3e-5, 5e-5
*   ### batch size: 16, 32 









In [None]:
# chosen fine-tuning hyperparameters
n_epochs, learning_rate, batch_size = 4, 2e-5, 16

# one loader per partition, each drawing its batches in random order
train_dataloader = DataLoader(train, batch_size=batch_size, sampler=RandomSampler(train))
val_dataloader = DataLoader(val, batch_size=batch_size, sampler=RandomSampler(val))
test_dataloader = DataLoader(test, batch_size=batch_size, sampler=RandomSampler(test))

# AdamW over all parameters of the classification model
optimizer = AdamW(model_3.parameters(), lr=learning_rate)

# linear learning-rate schedule without warmup, spanning every optimizer step of training
total_training_steps = len(train_dataloader) * n_epochs
lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                                               num_warmup_steps=0,
                                               num_training_steps=total_training_steps)

## We define a function to calculate accuracy

In [None]:
def calculate_accuracy(predictions, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        predictions: 2-D numpy array of per-class scores, one row per sample.
        labels: numpy array of integer class ids (flattened before comparison).

    Returns:
        Number of matching samples divided by the number of labels.
    """
    # index of the largest score in every row = predicted class
    predicted = np.argmax(predictions, axis=1).flatten()
    targets = labels.flatten()
    n_correct = np.sum(predicted == targets)
    return n_correct / len(targets)

# Training phase
#### takes about 40 minutes

In [None]:
for i in range(n_epochs):

  # running totals used to report the averages at the end of each epoch
  train_loss_per_epoch = 0
  val_loss_per_epoch = 0
  val_accuracy_per_epoch = 0
  # number of validation samples seen, so metrics are averaged per sample
  n_val_samples = 0

  # set the model on training mode. this DOES NOT perform a training step.
  # we do this because some layers behave differently during train phase and validation phase
  model_3.train()

  print("Epoch ", i+1)
  print()

  # iterate through all batches in training set
  for batch in train_dataloader:

    # copy tensors to the device the model runs on
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # gradients accumulate across backward() calls by default,
    # so clear them before processing the next batch
    model_3.zero_grad()

    # forward pass; because labels are passed, the first two outputs are the loss and the logits
    outputs = model_3(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask,
                      labels=b_labels)

    loss, logits = outputs[:2]

    # sum the loss for all batches
    train_loss_per_epoch += loss.item()

    # calculate gradients
    loss.backward()

    # update parameters
    optimizer.step()

    # update learning rate
    lr_scheduler.step()

  # calculate average of the loss in this epoch
  loss_avg_train = train_loss_per_epoch/len(train_dataloader)

  print("Average train loss in epoch {:}: {:}\n".format(i+1, loss_avg_train))


  # set the model on evaluation mode
  model_3.eval()

  # iterate through all batches in validation set
  # the rest is similar to the training phase, but we don't do back propagation
  for batch in val_dataloader:

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # no need to keep track of gradients in validation
    with torch.no_grad():
      outputs = model_3(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
      loss, logits = outputs[:2]

    # to convert these values to numpy arrays, they need to be on the cpu
    logits = logits.cpu().numpy()
    label_ids = b_labels.cpu().numpy()

    # Weight each batch by its size: the last batch can be smaller, so a plain mean of
    # per-batch means would give its samples too much influence on the reported metrics.
    n_batch_samples = len(label_ids)
    val_loss_per_epoch += loss.item() * n_batch_samples
    val_accuracy_per_epoch += calculate_accuracy(logits, label_ids) * n_batch_samples
    n_val_samples += n_batch_samples

  # per-sample averages over the whole validation set
  loss_avg_val = val_loss_per_epoch/n_val_samples
  accuracy_avg_val = val_accuracy_per_epoch/n_val_samples

  print("Average val loss in epoch {:}: {:}\n".format(i+1, loss_avg_val))
  print("Average val accuracy in epoch {:}: {:}\n".format(i+1, accuracy_avg_val))
  print()

Epoch  1

Average train loss in epoch 1: 0.3432498910383547

Average val loss in epoch 1: 0.31108057681233325

Average val accuracy in epoch 1: 0.8865248226950354


Epoch  2

Average train loss in epoch 2: 0.19583959212949106

Average val loss in epoch 2: 0.27106292504183155

Average val accuracy in epoch 2: 0.8958333333333333


Epoch  3

Average train loss in epoch 3: 0.11932560562509108

Average val loss in epoch 3: 0.3179190177619061

Average val accuracy in epoch 3: 0.8980496453900709


Epoch  4

Average train loss in epoch 4: 0.06959772049633664

Average val loss in epoch 4: 0.3673003558218083

Average val accuracy in epoch 4: 0.8927304964539008




# Time to test the model!

In [None]:
# set the model on evaluation mode
model_3.eval()

# predicted class ids and true labels for every test sample, in iteration order
predictions , true_labels = list(), list()

# sum of per-batch accuracies weighted by batch size, and the number of samples seen
test_accuracy = 0
n_test_samples = 0

# same procedure as validation: forward pass only, no gradients
for batch in test_dataloader:

  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)

  with torch.no_grad():
    # no labels are passed, so the first output is the logits
    outputs = model_3(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # keep the per-sample results so they can be inspected after the loop
  predictions.extend(np.argmax(logits, axis=1))
  true_labels.extend(label_ids)

  # Weight each batch by its size: the last batch can be smaller, so a plain mean of
  # per-batch accuracies would not equal the accuracy over all test samples.
  test_accuracy += calculate_accuracy(logits, label_ids) * len(label_ids)
  n_test_samples += len(label_ids)

accuracy = test_accuracy/n_test_samples
print('Accuracy on test set: ', accuracy)

Accuracy on test set:  0.921875


## We reached about 92 percent accuracy on the test set!

# Test models on a different language text

In [None]:
# I found some text in other languages for testing the models
negative_spanish_text = "Fue la peor película que he visto y no me gustó nada."
positive_russian_text = "Мехради не очень застенчив."
negative_hindi_text = "मुझे फिल्म बिल्कुल पसंद नहीं आई, यह एक भयानक फिल्म थी, घृणित"
positive_persian_text = "امروز هوا عالی است"

# get input ids and attention masks for the text
text_input_ids, text_attention_masks = get_ids_and_masks(positive_persian_text, tokenizer) # try different inputs here

# copy ids and masks to the device the models run on
text_input_ids = text_input_ids.to(device)
text_attention_masks = text_attention_masks.to(device)

# make sure the fine-tuned classifier is in evaluation mode (dropout off) for inference
model_3.eval()

# Inference only: run both transformer models without tracking gradients. The no_grad
# guard around the classifier call had been commented out, so that forward pass was
# building an unnecessary autograd graph.
with torch.no_grad():
  # features for the keras model and the logistic regression
  last_hidden_state = model(text_input_ids,
                            token_type_ids=None,
                            attention_mask=text_attention_masks)
  # direct prediction from the fine-tuned classifier
  outputs = model_3(text_input_ids, token_type_ids=None, attention_mask=text_attention_masks)

# vector at sequence position 0 of the encoder output, as a numpy array
text_features = last_hidden_state[0][: , 0, :].cpu().numpy()

deep_model_prediction = deep_model.predict(text_features)
logistic_rg_prediction = best_clf.predict(text_features)
logits = outputs[0]

print("Deep model's prediction is:", "Positive\n" if deep_model_prediction[0] > 0.5 else "Negative\n")
print("Logistic regressor's prediction is:", "Positive\n" if logistic_rg_prediction[0] == 1 else "Negative\n")
print("XLM classifier's prediction is:", "Positive\n" if logits.argmax() == 1 else "Negative\n")

Deep model's prediction is: Positive

Logistic regressor's prediction is: Positive

XLM classifier's prediction is: Positive

