# Project 5 - Sequence and Sentiment Classification using Transformers

## Part 2: Resource Limited Competition: Sentiment Analysis

## 1. Setup
### 1.1 Dependencies
Disclaimer: The output of cells that do not produce helpful output (for example, the pip install commands) was cleared to make the notebook easier to read.

In [None]:
!pip install datasets transformers sklearn simpletransformers


### 1.2 Imports

In [None]:
import datasets
from datasets import load_dataset
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

# Misc
import os
import shutil
import csv
import re
from io import StringIO
import requests
import string
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as sn

# Pandas
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Keras
import keras.preprocessing
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, AveragePooling1D, Dense, Dropout, Activation, Embedding
from keras import backend as K
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# tensorflow
import tensorflow as tf

# Torch
import torch

# Sklearn
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,f1_score

# simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

### 1.3 Constants

In [None]:
# (start, stop) row windows used further down to slice the full IMDB train and
# test frames into the subsets this notebook works with.
test_range = (11_500, 13_500)
train_range = (10_000, 15_000)

### 1.4 Environment
We check that the environment is properly set up, so that the GPU is used for training our models.

In [None]:
# Check if device supports CUDA interface (queried once and reused below)
CUDA = torch.cuda.is_available()
# Make program run on gpu (cuda:0) if available, otherwise fall back to the CPU
device = torch.device('cuda:0' if CUDA else 'cpu:0')
if CUDA:
    # torch.cuda.set_device only accepts CUDA devices; handing it the CPU
    # device raises ValueError, so it is skipped on CPU-only runtimes.
    torch.cuda.set_device(device)
print('Using device:', device)

Using device: cuda:0


In [None]:
# Check and print information about available GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec  2 13:09:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    29W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Get GPU name: `-L` prints one line per attached GPU with its model and UUID
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-0673fcbd-ec45-9e6e-1f3c-28e12c2e13c9)


In [None]:
# Check Memory
from psutil import virtual_memory

# NOTE: this reads the *total* memory figure (converted to decimal gigabytes),
# even though the message below words it as "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if high_ram else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


## Import


Here we import the data from the Stanford Repository.

In [None]:
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the archive into the current working directory (cache_dir='.' with
# an empty cache_subdir) and unpack it; `dataset` keeps whatever path get_file
# returns and is used below to locate the extracted folder.
dataset = tf.keras.utils.get_file(
    origin=URL,
    fname="aclImdb_v1.tar.gz",
    cache_subdir='',
    cache_dir='.',
    untar=True,
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


After downloading the data from the above link, we found that the directory structure looks like this:

main_directory/                 
...train/                
......a_text_1.txt                
......a_text_2.txt                
...test/                
......a_text_1.txt                
......a_text_2.txt                
...unsup/                
......                

We define the paths to the main directory and its subdirectories. We also remove the "unsup" directory, which contains unlabeled reviews for unsupervised learning.

In [None]:
# The extracted corpus sits in an "aclImdb" folder next to the downloaded archive.
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(main_dir, 'train')
# train/unsup holds the unlabeled reviews; it is removed so that only the
# labelled class folders remain under train/.
remove_dir = os.path.join(train_dir, 'unsup')
# Guarded delete: re-running this cell after the folder is already gone must
# not fail with FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [None]:
# Read the whole training split into a DataFrame, then keep rows
# train_range[0]:train_range[1] (train[10000:15000], as the assignment requires)
# as train_data.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    validation_split=0,
    shuffle=False,
    batch_size=25000)

# batch_size matches the 25000 files found, so this single batch holds every review.
for texts, labels in train.take(1):
  train_feat = texts.numpy()
  train_lab = labels.numpy()

# One row per review: column 0 is the text, column 1 the class id.
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The review texts arrive as bytes; turn them into str.
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train_data = train.iloc[train_range[0]:train_range[1]]

Found 25000 files belonging to 2 classes.


In [None]:
# Same procedure for the test split: load everything, then keep rows
# test_range[0]:test_range[1] (test[11500:13500], as the assignment requires)
# as test_data.
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test',
    validation_split=0,
    shuffle=False,
    batch_size=25000)

# batch_size matches the 25000 files found, so this single batch holds every review.
for texts, labels in test.take(1):
  test_feat = texts.numpy()
  test_lab = labels.numpy()

# One row per review: column 0 is the text, column 1 the class id.
test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The review texts arrive as bytes; turn them into str.
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test_data = test.iloc[test_range[0]:test_range[1]]

Found 25000 files belonging to 2 classes.


In [None]:
# Report the size and the class balance of the training subset.
train_label_counts = train_data['LABEL_COLUMN'].value_counts()
print("Train_data has a shape of {}. \n\n The number of positive(1) and negative(0) reviews are:\n {}".format(
    train_data.shape, train_label_counts))

Train_data has a shape of (5000, 2). 

 The number of positive(1) and negative(0) reviews are:
 1    2500
0    2500
Name: LABEL_COLUMN, dtype: int64


In [None]:
# Report the size and the class balance of the test subset.
test_label_counts = test_data['LABEL_COLUMN'].value_counts()
print("Test_data has a shape of {}. \n\n The number of positive(1) and negative(0) reviews are:\n {}".format(
    test_data.shape, test_label_counts))

Test_data has a shape of (2000, 2). 

 The number of positive(1) and negative(0) reviews are:
 1    1000
0    1000
Name: LABEL_COLUMN, dtype: int64


In [None]:
# Display the training subset (it keeps the row labels of the full training frame)
train_data

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
10000,"First, the CGI in this movie was horrible. I w...",0
10001,The film is about a sabretooth on the lose at ...,0
10002,Everything about this film is hog wash. Pitifu...,0
10003,Spoilers will be in this. The movie could have...,0
10004,Three giant sabretooth tigers(..created in a l...,0
...,...,...
14995,The minute I started watching this I realised ...,1
14996,i really loved this version of Emma the best. ...,1
14997,Until the 1990s there had never been a film ba...,1
14998,Old Jane's mannered tale seems very popular th...,1


## Models and Classification Arguments
Here we import some Models and compare their performances in prediction. 

In [None]:
# Keep the "transformers" logger down to warnings and above, while the root
# configuration asks for INFO-level messages.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

from sklearn.preprocessing import MultiLabelBinarizer

def f1(predictions, outputs):
    """Return the weighted F1 score of *predictions* against the test labels.

    Args:
        predictions: Predicted class labels, one per row of ``test_data`` and
            in the same order.
        outputs: Unused; kept so existing two-argument calls keep working.

    Returns:
        The weighted-average F1 score.
    """
    # The labels are scalar class ids, so they are passed to f1_score directly.
    # MultiLabelBinarizer expects an iterable of label *collections* and fails
    # on scalar labels; fitting it separately on the predictions could also
    # produce a different column layout than the one fitted on the truth.
    labels = test_data['LABEL_COLUMN'].values.tolist()
    return f1_score(labels, list(predictions), average='weighted')

In [None]:
# Ground-truth labels of the test subset, in row order.
y_true = test_data['LABEL_COLUMN'].values.tolist()

def evaluate(model):
  """Score *model* on the test subset.

  Args:
    model: Object exposing ``predict(list_of_texts)`` whose first return value
      holds the predicted labels, and ``config._name_or_path``.

  Returns:
    dict with the keys "model_name", "accuracy", "f1_macro" and "f1_micro".
  """
  texts = test_data['DATA_COLUMN'].values.tolist()
  # Run inference exactly once. The predictions may come back as an array or
  # as a plain list; np.asarray(...).tolist() normalises both. The previous
  # try/bare-except repeated the whole prediction pass whenever `.tolist()`
  # was missing, and would also have swallowed genuine prediction errors.
  y_pred = np.asarray(model.predict(texts)[0]).tolist()
  return {
      "model_name": model.config._name_or_path,
      "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
      "f1_macro": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
      "f1_micro": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
  }

In [None]:

# Training configuration passed as `args` to every model created below.
# NOTE(review): use_early_stopping is enabled, yet the recorded training logs
# show all 10 epochs running — confirm whether early stopping takes effect
# with this configuration.
model_args = ClassificationArgs(
    output_dir="outputs/",
    overwrite_output_dir=True,
    num_train_epochs=10,
    train_batch_size=64,
    weight_decay=0.0001,
    use_early_stopping=True,
)

### Model 1: distilbert-base-uncased-finetuned-sst-2-english
This model is based on DistilBERT base, a distilled version of the BERT base model, and was later fine-tuned on the Stanford Sentiment Treebank (SST). The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. We use the two-way (positive/negative) class split, and use only sentence-level labels.

In [None]:


# Model 1: DistilBERT checkpoint fine-tuned on SST-2, loaded as a two-label
# classifier with per-class weights [1, 1] and the shared model_args.
model1 = ClassificationModel(
    model_type="distilbert",
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2,
    weight=[1, 1],
    args=model_args,
)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [None]:
# Evaluate the model on test_data before fine-tuning it (baseline)
metric1_before_training=evaluate(model1)
print("Model1")
metric1_before_training

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model1


{'accuracy': 0.8075,
 'f1_macro': 0.8066705684061043,
 'f1_micro': 0.8075,
 'model_name': 'distilbert-base-uncased-finetuned-sst-2-english'}

In [None]:
# Fine-tune model1 on the training subset, using the settings in model_args
model1.train_model(train_data)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/5000 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_distilbert_128_2_2


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.


(790, 0.10243374093471071)

In [None]:
# Evaluate the model on test_data again, after fine-tuning
metric1=evaluate(model1)
print("Model1")
metric1

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model1


{'accuracy': 0.8385,
 'f1_macro': 0.8384660374843811,
 'f1_micro': 0.8385,
 'model_name': 'distilbert-base-uncased-finetuned-sst-2-english'}

### Model 2: echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid
This model is interesting because it introduces a block pruning method.

In [None]:
# Model 2: block-pruned BERT-base-uncased SST-2 checkpoint, loaded as a
# two-label classifier with per-class weights [1, 1] and the shared model_args.
model2 = ClassificationModel(
    model_type="bert",
    model_name="echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid",
    num_labels=2,
    weight=[1, 1],
    args=model_args,
)

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/352M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

In [None]:
# Evaluate the model on test_data before fine-tuning it (baseline)
metric2_before_training=evaluate(model2)
print("Model2")
metric2_before_training

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model2


{'accuracy': 0.7055,
 'f1_macro': 0.6867753522514285,
 'f1_micro': 0.7055,
 'model_name': 'echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid'}

In [None]:
# Fine-tune model2 on the training subset, using the settings in model_args
model2.train_model(train_data)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/5000 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_128_2_2


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


(790, 0.12338230933145254)

In [None]:
# Evaluate the model on test_data again, after fine-tuning
metric2=evaluate(model2)
print("Model2")
metric2

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model2


{'accuracy': 0.8425,
 'f1_macro': 0.8423975189867211,
 'f1_micro': 0.8425,
 'model_name': 'echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid'}


### Model 3: gchhablani/bert-base-cased-finetuned-sst2
Compared to the bert-base-cased model, this model replaces the self-attention sublayers with simple linear transformations that "mix" input tokens. They show that Transformer encoder can be sped up, with limited accuracy costs. In this paper, they showed that these linear mixers, along with standard nonlinearities in feed-forward layers, prove competent at modeling semantic relationships in several text classification tasks.

In [None]:


# Model 3: BERT-base-cased checkpoint fine-tuned on SST-2 (per its name),
# loaded as a two-label classifier with per-class weights [1, 1] and the
# shared model_args.
model3 = ClassificationModel(
    model_type="bert",
    model_name="gchhablani/bert-base-cased-finetuned-sst2",
    num_labels=2,
    weight=[1, 1],
    args=model_args,
)

Downloading:   0%|          | 0.00/879 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/320 [00:00<?, ?B/s]

In [None]:
# Evaluate the model on test_data before fine-tuning it (baseline)
metric3_before_training=evaluate(model3)
print("Model3")
metric3_before_training

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model3


{'accuracy': 0.822,
 'f1_macro': 0.8219855808320473,
 'f1_micro': 0.822,
 'model_name': 'gchhablani/bert-base-cased-finetuned-sst2'}

In [None]:
# Fine-tune model3 on the training subset, using the settings in model_args
model3.train_model(train_data)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/5000 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_128_2_2


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


(790, 0.08663941799085352)

In [None]:
# Evaluate the model on test_data again, after fine-tuning
metric3=evaluate(model3)
print("Model3:")
metric3

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model3:


{'accuracy': 0.8555,
 'f1_macro': 0.8554606491617343,
 'f1_micro': 0.8554999999999999,
 'model_name': 'gchhablani/bert-base-cased-finetuned-sst2'}



### Model 4: roberta-base
This model was introduced in [Liu et al. 2019](https://arxiv.org/pdf/1907.11692.pdf). According to the authors, RoBERTa improves on BERT in the following four ways:
 - (1) training the model longer, with bigger batches, over more data;
 - (2) removing the next sentence prediction objective;
 - (3) training on longer sequences; and
 - (4) dynamically changing the masking pattern applied to the training data.
               
They also collect a large new dataset (CC-NEWS) of comparable size to other privately used datasets, to better control for training set size effects.


In [None]:

# Model 4: plain roberta-base, loaded as a two-label classifier with per-class
# weights [1, 1] and the shared model_args. Its classifier weights are newly
# initialised on load (see the warning printed by this cell).
model4 = ClassificationModel(
    model_type="roberta",
    model_name="roberta-base",
    num_labels=2,
    weight=[1, 1],
    args=model_args,
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Evaluate the model on test_data before fine-tuning it (baseline)
metric4_before_training=evaluate(model4)
print("Model4")
metric4_before_training

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model4


{'accuracy': 0.5,
 'f1_macro': 0.3333333333333333,
 'f1_micro': 0.5,
 'model_name': 'roberta-base'}

In [None]:
# Fine-tune model4 on the training subset, using the settings in model_args
model4.train_model(train_data)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/5000 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_2_2


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(790, 0.1401088878889627)

In [None]:
# Evaluate the model on test_data again, after fine-tuning
metric4=evaluate(model4)
print("Model4:")
metric4

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model4:


{'accuracy': 0.8825,
 'f1_macro': 0.88249926562041,
 'f1_micro': 0.8825,
 'model_name': 'roberta-base'}



### Model 5:  siebert/sentiment-roberta-large-english
This model is a fine-tuned checkpoint of RoBERTa-large ([Liu et al. 2019](https://arxiv.org/pdf/1907.11692.pdf)). It enables reliable binary sentiment analysis for various types of English-language text. 


In [None]:

# Model 5: RoBERTa-large sentiment checkpoint, loaded as a two-label
# classifier with per-class weights [1, 1] and the shared model_args.
model5 = ClassificationModel(
    model_type="roberta",
    model_name="siebert/sentiment-roberta-large-english",
    num_labels=2,
    weight=[1, 1],
    args=model_args,
)

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

In [None]:
# Evaluate the model on test_data before fine-tuning it (baseline)
metric5_before_training=evaluate(model5)
print("Model5")
metric5_before_training

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Model5


{'accuracy': 0.888,
 'f1_macro': 0.8879864463600096,
 'f1_micro': 0.888,
 'model_name': 'siebert/sentiment-roberta-large-english'}

In [None]:
# Fine-tune model5 on the training subset, using the settings in model_args
# NOTE(review): no output is recorded for this cell — confirm it was actually run.
model5.train_model(train_data)

In [None]:
# Evaluate the model on test_data again, after fine-tuning
# NOTE(review): no output is recorded for this cell, and metric5 is not used in the summary tables.
metric5=evaluate(model5)
print("Model5:")
metric5

## Summary

In [None]:
# Column order shared by both comparison tables.
metric_columns = ['model_name', 'accuracy', 'f1_macro', 'f1_micro']

# before fine tuning: all five checkpoints
performance_before_training = pd.DataFrame([
    metric1_before_training,
    metric2_before_training,
    metric3_before_training,
    metric4_before_training,
    metric5_before_training,
])[metric_columns]

# after fine tuning: models 1-4 only (metric5 is not part of this table)
performance_comparison = pd.DataFrame([metric1, metric2, metric3, metric4])[metric_columns]

In [None]:
# Display the metrics measured before fine-tuning
performance_before_training

Unnamed: 0,model_name,accuracy,f1_macro,f1_micro
0,distilbert-base-uncased-finetuned-sst-2-english,0.8075,0.806671,0.8075
1,echarlaix/bert-base-uncased-sst2-acc91.1-d37-h...,0.7055,0.686775,0.7055
2,gchhablani/bert-base-cased-finetuned-sst2,0.822,0.821986,0.822
3,roberta-base,0.5,0.333333,0.5
4,siebert/sentiment-roberta-large-english,0.888,0.887986,0.888


In [None]:
# Display the metrics measured after fine-tuning (models 1-4)
performance_comparison

Unnamed: 0,model_name,accuracy,f1_macro,f1_micro
0,distilbert-base-uncased-finetuned-sst-2-english,0.8385,0.838466,0.8385
1,echarlaix/bert-base-uncased-sst2-acc91.1-d37-h...,0.8425,0.842398,0.8425
2,gchhablani/bert-base-cased-finetuned-sst2,0.8555,0.855461,0.8555
3,roberta-base,0.8825,0.882499,0.8825


Setting aside Model 5, "siebert/sentiment-roberta-large-english", which is fine-tuned on a large dataset, we found that the prediction accuracy of Model 4, "roberta-base", increased the most through fine-tuning (from 0.50 to 0.88) and is the highest among the models we fine-tuned. We attribute this to the fact that, compared with the BERT models on which Models 1, 2 and 3 are based, RoBERTa is pre-trained for longer, with bigger batches, on more data and on longer sequences. Although we cannot tell whether removing the next sentence prediction objective benefits the model, the dynamic masking pattern definitely helps. We will see whether we can improve the model further in the future.