## Here we will use Roberta Inference with HuggingFace Libraries for making the model

In [1]:
# Importing dependencies

# Data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from scipy.special import softmax

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# Hugging Face
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Others
from sklearn.metrics import f1_score
import random
import os

In [2]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
def extract_polarity_scores(test_string):
    encoded = tokenizer(test_string, return_tensors='pt')
    output = model(**encoded)
    scores = output[0][0].detach().numpy() # Converting from tensor object to numpy array
    soft_scores = softmax(scores)
    scores_dict = {
        'roberta_neg': soft_scores[0],
        'roberta_neu': soft_scores[1],
        'roberta_pos': soft_scores[2],
    }
    return scores_dict

In [8]:
# Trying Roberta Model on a sample text
extract_polarity_scores("I hate this. This is so bad!")

{'roberta_neg': 0.98044556,
 'roberta_neu': 0.015930237,
 'roberta_pos': 0.0036241112}

In [9]:
extract_polarity_scores("This is really mind-blowing. Will try this again for sure")

{'roberta_neg': 0.003799245,
 'roberta_neu': 0.028811913,
 'roberta_pos': 0.96738887}

### We can see that the loaded model works perfectly

In [10]:
from tqdm.notebook import tqdm

#### Let us load the training dataset

In [16]:
df = pd.read_csv('Cleaned_Data.csv')
df

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...,...
393574,568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
393575,568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
393576,568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
393577,568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [19]:
def get_predictions(df):
    res = {}
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            text = row['Text']
            myid = row['Id']
            roberta_result = extract_polarity_scores(text)
            res[myid] = roberta_result
        except RuntimeError:
            print(f'Cannot work for id: {myid}')
    return res

In [20]:
# Since the dataset is very big and we are not training the model, we can predict for a few examples

results = get_predictions(df.head(1000))

  0%|          | 0/1000 [00:00<?, ?it/s]

Cannot work for id: 83
Cannot work for id: 187
Cannot work for id: 529
Cannot work for id: 540
Cannot work for id: 746
Cannot work for id: 863


In [31]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})
results_df = results_df.merge(df.head(1000), how='left')

In [32]:
results_df

Unnamed: 0.1,Id,roberta_neg,roberta_neu,roberta_pos,Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,0.009624,0.049980,0.940395,0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,0.508986,0.452414,0.038600,1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,0.003229,0.098067,0.898704,2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,0.002295,0.090219,0.907486,3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,0.001635,0.010302,0.988063,4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,230,0.867932,0.115751,0.016317,229,B001L4ELRW,A1AK335P17JXL0,Victoria Brown,0,0,1,1345593600,Not Good,This candy is not as described. The middle is ...
227,231,0.001267,0.015075,0.983658,230,B003SO503C,A2ODZ3CH8PMYTL,Nanette,0,0,5,1347235200,Love this tea!,I started drinking the power slim tea when I w...
228,232,0.001096,0.032065,0.966839,231,B003SO503C,A3OXRFCJI67IMN,Bold Consumer,0,0,5,1333843200,Really Nice Taste!,I'm trying several of the Wu Yi teas. I like t...
229,233,0.002680,0.018355,0.978965,232,B003ZFXJDW,A2HL6876LDPJIM,flageolet,0,0,5,1343692800,just give me some watermelon and citron sea salt,"i cannot live without this citron falksalt, it..."
