SiEBERT - English-Language Sentiment Classification



In [1]:
! pip install transformers
! pip install pandas
! pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 43.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 24.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading 

In [2]:
# Load the Colab Drive helper and mount Google Drive so that the CSV files
# read later in this notebook (below /content/drive/) are reachable.
from google.colab import drive

# This will prompt for authorization.
# The Drive contents then appear under the '/content/drive/' mount point.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import time
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [4]:
# Create class for data preparation
class SimpleDataset:
    """Minimal map-style dataset wrapping a tokenizer's output.

    ``tokenized_texts`` is a mapping from feature name to a per-example
    sequence. It must contain an ``"input_ids"`` entry, whose length is
    reported as the dataset size.
    """

    def __init__(self, tokenized_texts):
        # Only a reference is stored; nothing is copied or converted.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The example count is taken from the "input_ids" feature.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every feature into one example dict.
        example = {}
        for feature_name, values in self.tokenized_texts.items():
            example[feature_name] = values[idx]
        return example

In [5]:
# Load tokenizer and model, create trainer
# The tokenizer and the classification model are loaded from the same
# checkpoint name so that they match each other.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No TrainingArguments are passed: in this notebook the Trainer is only used
# for its predict() call further down.
trainer = Trainer(model=model)

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [7]:
# Read the three test CSVs from Drive and stack them into a single frame
# (other sources such as .xls work as well, as long as they yield DataFrames).
path_to_folder = "/content/drive/My Drive/data/cz4045/"
df_list = [
    pd.read_csv(path_to_folder + csv_name)
    for csv_name in ('test_df_Bryson.csv', 'test_df_Gx.csv', 'test_df_Kelvin.csv')
]
# Keep the per-file frames reachable under their individual names.
test_raw1, test_raw2, test_raw3 = df_list
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
test_df = pd.concat(df_list, ignore_index=True)

In [8]:
# Remove the 'Unnamed: *' columns (presumably index columns written out by
# earlier to_csv calls - confirm against whatever produced the CSVs).
# errors='ignore' keeps this cell re-runnable: test_df is reassigned here, so
# a second execution would otherwise raise KeyError on the columns that the
# first execution already dropped.
test_df = test_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], errors='ignore')
print(len(test_df))
test_df.head()

2330


Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,languages,Annotator_1,Annotator_2
0,1982137452,1,The content is all messed up,,I started this book this week for my book club...,True,children,Language.ENGLISH,-1,-1
1,125030170X,1,Duplicate copy.Damaged book.,,Pages missing.,True,children,Language.ENGLISH,-1,-1
2,63215381,1,Awful,,I gave up after 38% of my Kindle. Yes we were ...,True,children,Language.ENGLISH,-1,-1
3,60935464,1,Syrupy Overload,3.0,The book is an example of leading the witness.,True,children,Language.ENGLISH,-1,-1
4,1501161938,1,Couldn‚Äôt read it; type too small!,1.0,"Beware, the type is TINY, I mean TINY. I am 60...",True,children,Language.ENGLISH,-1,-1


In [9]:
# SiEBERT only does positive and negative classification, so neutral reviews
# (Annotator_1 == 0) are dropped. .copy() makes `test` an independent frame,
# so the column assignments below no longer trigger SettingWithCopyWarning.
test = test_df.loc[test_df.Annotator_1 != 0].copy()
# Model input is "<title>. <description>"; the concatenation is NaN whenever
# either part is missing.
test['text'] = test['reviewTitle'] + '. ' + test['reviewDescription']
test = test[['text', 'Annotator_1']].rename(columns={'Annotator_1': 'polarity'})
# Drop reviews without text from `test` itself (not just from pred_texts) so
# that row i of `test` is always the review behind pred_texts[i] and
# len(test) equals the number of texts handed to the model.
test = test.dropna(subset=['text']).reset_index(drop=True)
# change negative label from -1 to 0 according to SiEBERT's expectations
test.loc[test['polarity'] == -1, 'polarity'] = 0
pred_texts = test['text'].astype('str').tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
# Tokenize texts and create prediction data set
# truncation=True and padding='max_length' are requested together so that
# every example ends up with features of one common length.
# NOTE(review): padding every review to the maximum length likely dominates
# the prediction time reported below; padding to the longest text could be
# cheaper - confirm it does not change the predictions before switching.
tokenized_texts = tokenizer(pred_texts,truncation=True,padding='max_length')
# Wrap the tokenizer output so it can be indexed one example at a time.
pred_dataset = SimpleDataset(tokenized_texts)

In [11]:
# Run predictions
# perf_counter() is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
predictions = trainer.predict(pred_dataset)
time_taken = time.perf_counter() - start_time
# Throughput counts the examples actually passed to the model
# (pred_dataset). len(test) only matches that number when no text was
# dropped between building `test` and building `pred_texts`.
rec_classified = len(pred_dataset)/time_taken
print("Predictions took ", time_taken, " seconds")
print("Number of reviews classified per second: ", rec_classified)

***** Running Prediction *****
  Num examples = 2227
  Batch size = 8


Predictions took  219.03862833976746  seconds
Number of reviews classified per second:  10.167156436651581


In [12]:
# Transform predictions to labels
# `predictions.predictions` holds the raw per-class logits, one row per text.
logits = predictions.predictions
# Predicted class id = position of the largest logit in each row.
preds = logits.argmax(-1)
# Translate class ids into the model's own label names.
labels = pd.Series(preds).map(model.config.id2label)
# Confidence = softmax probability of the winning class. Subtracting the row
# maximum before exp() leaves the probabilities mathematically unchanged but
# prevents overflow for large logits; exp() is also evaluated only once.
exp_logits = np.exp(logits - logits.max(-1, keepdims=True))
scores = (exp_logits/exp_logits.sum(-1, keepdims=True)).max(-1)

In [13]:
# Assemble one row per classified text: the text itself, the predicted class
# id, the corresponding label name, and the confidence score.
result_columns = ['text','pred','label','score']
result_rows = list(zip(pred_texts,preds,labels,scores))
df = pd.DataFrame(result_rows, columns=result_columns)
df

Unnamed: 0,text,pred,label,score
0,The content is all messed up. I started this b...,0,NEGATIVE,0.999510
1,Duplicate copy.Damaged book.. Pages missing.,0,NEGATIVE,0.999505
2,Awful. I gave up after 38% of my Kindle. Yes w...,0,NEGATIVE,0.999507
3,Syrupy Overload. The book is an example of lea...,0,NEGATIVE,0.999444
4,"Couldn‚Äôt read it; type too small!. Beware, t...",0,NEGATIVE,0.999496
...,...,...,...,...
2222,No. Just awful!,0,NEGATIVE,0.999451
2223,Bored. I was so bored reading this book. I swi...,0,NEGATIVE,0.999505
2224,"Ugh!. Ugh! Too wordy, predictable and shallow....",0,NEGATIVE,0.999507
2225,A story that made me cry. I have so many fond ...,1,POSITIVE,0.998895


In [14]:
# Attach the annotated polarity by position instead of merging on 'text'.
# Joining on the free-text column multiplies rows whenever two reviews share
# the same text, and re-running such a merge adds polarity_x/polarity_y
# columns. The rows of `df` follow the order of pred_texts, i.e. the rows of
# `test` that have a text, so the labels can be aligned one-to-one.
labelled = test.dropna(subset=['text'])
assert len(labelled) == len(df), "predictions and labelled reviews are out of step"
df = df.assign(polarity=labelled['polarity'].to_numpy())
df

Unnamed: 0,text,pred,label,score,polarity
0,The content is all messed up. I started this b...,0,NEGATIVE,0.999510,0
1,Duplicate copy.Damaged book.. Pages missing.,0,NEGATIVE,0.999505,0
2,Awful. I gave up after 38% of my Kindle. Yes w...,0,NEGATIVE,0.999507,0
3,Syrupy Overload. The book is an example of lea...,0,NEGATIVE,0.999444,0
4,"Couldn‚Äôt read it; type too small!. Beware, t...",0,NEGATIVE,0.999496,0
...,...,...,...,...,...
2222,No. Just awful!,0,NEGATIVE,0.999451,0
2223,Bored. I was so bored reading this book. I swi...,0,NEGATIVE,0.999505,0
2224,"Ugh!. Ugh! Too wordy, predictable and shallow....",0,NEGATIVE,0.999507,0
2225,A story that made me cry. I have so many fond ...,1,POSITIVE,0.998895,1


In [15]:
# Annotated polarity and model prediction as plain Python lists, in row order.
y_true, y_pred = (df[column].tolist() for column in ('polarity', 'pred'))

In [16]:
# Accuracy of the predicted class ids against the annotated polarity.
acc = accuracy_score(y_true, y_pred)
print('Test Accuracy: ', acc)

Test Accuracy:  0.9744050291872475


In [17]:
# Precision with scikit-learn's default settings (no average/pos_label given).
# NOTE(review): the defaults should score the class labelled 1 - confirm
# that this is the intended (positive) class.
precision = precision_score(y_true, y_pred)
print('Test Precision: ', precision)

Test Precision:  0.9853316326530612


In [18]:
# Recall with scikit-learn's default settings (no average/pos_label given).
# NOTE(review): the defaults should score the class labelled 1 - confirm
# that this is the intended (positive) class.
recall = recall_score(y_true, y_pred)
print('Test Recall: ', recall)

Test Recall:  0.9784673844205193


In [19]:
# F1 score with scikit-learn's default settings (no average/pos_label given).
# NOTE(review): the defaults should score the class labelled 1 - confirm
# that this is the intended (positive) class.
f1 = f1_score(y_true, y_pred)
print('Test F-measure: ', f1)

Test F-measure:  0.9818875119161105


In [20]:
# Gather the metrics and timing figures into a one-row table and write it as
# CSV into the same Drive folder the test data was read from.
results = {
    'Model': ['SiEBERT'],
    'Test Accuracy': [acc],
    'Test Precision': [precision],
    'Test Recall': [recall],
    'Test F1': [f1],
    'Time for Predictions': [time_taken],
    'No. reviews classified per second': [rec_classified],
}
results_df = pd.DataFrame(results)
results_df.to_csv(path_to_folder+'siebert_results.csv', index=False)