# Sentiment analysis with a pretrained model (BERT base multilingual)

# Import Libraries

In [24]:
!pip3 install --progress-bar=on torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [28]:
!pip install transformers



In [27]:
!pip install scikit-learn



In [48]:
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Get Data
- 4913 Amazon reviews 
- Keep only the `overall` and `reviewText` columns

In [59]:
# Location of the reviews CSV. Set the AMAZON_REVIEWS_CSV environment variable to
# point at your own copy of the file; when it is not set, the original author's
# local path is used so existing runs keep working unchanged.
DATA_PATH = os.environ.get(
    "AMAZON_REVIEWS_CSV",
    "C:/Users/jules/Documents/Sentiment_analysis/sentiment_analysis/data/amazon_reviews.csv",
)

# Read the raw reviews and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


# 2. Cleaning Data

In [60]:
# Discard rows whose text is exactly "No issues." and keep only the two
# columns used in the rest of the notebook.
not_placeholder = data["reviewText"] != "No issues."
text_and_rating = data.loc[not_placeholder, ["reviewText", "overall"]]

# Exclude 5-star reviews whose text starts with the given prefix, then drop
# rows with missing values and renumber the index from 0.
starts_with_prefix = text_and_rating['reviewText'].str.startswith('Works fine my Virgin')
is_five_star = text_and_rating['overall'] == 5.0
reviews = (
    text_and_rating[~(starts_with_prefix & is_five_star)]
    .dropna()
    .reset_index(drop=True)
)

# Non-null count per column, as a sanity check on the cleaned frame.
reviews.count()

reviewText    4911
overall       4911
dtype: int64

In [61]:
# Number of reviews for each star rating, most frequent first.
reviews.loc[:, "overall"].value_counts()

5.0    3919
4.0     526
1.0     244
3.0     142
2.0      80
Name: overall, dtype: int64

In [62]:
# Store the review text using pandas' dedicated string dtype.
reviews["reviewText"] = reviews["reviewText"].astype(pd.StringDtype())

This DataFrame is imbalanced: most reviews have 5 stars, so we downsample the 5-star reviews to bring their count closer to that of the other ratings.
For greater accuracy, we would need more negative reviews.

In [63]:
# Split the reviews by star rating. Boolean-mask indexing already returns new
# frames, so the full DataFrame does not need to be copied first.
# NOTE: "neutral" here groups 3- and 4-star reviews, whereas the sentiment
# mapping applied later in this notebook treats 4 stars as "positive".
data_negative = reviews[reviews["overall"].isin([1.0, 2.0])]
data_neutral = reviews[reviews["overall"].isin([3.0, 4.0])]

# Downsample the over-represented 5-star reviews to 600 rows; the fixed seed
# makes the draw reproducible. The original index labels are kept.
data_positive = reviews[reviews["overall"] == 5.0].sample(n=600, random_state=42)

print("POSITIVE LENGTH: ",len(data_positive))
print("NEUTRAL LENGTH: ",len(data_neutral))
print("NEGATIVE LENGTH : ",len(data_negative))

POSITIVE LENGTH:  600
NEUTRAL LENGTH:  668
NEGATIVE LENGTH :  324


In [64]:
# Stack the three rating groups (row-wise) into the evaluation set.
rating_groups = [data_positive, data_neutral, data_negative]
clean_data = pd.concat(rating_groups)

In [65]:
# Load the tokenizer and the model from the same pretrained checkpoint.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [66]:
def sentiment_score(review):
    """Predict a 1-5 star rating for a single review text.

    Args:
        review: The review text to score.

    Returns:
        int: Index of the highest logit plus one, i.e. a value in 1..5
        for this five-class model.
    """
    # Truncate at the token level: slicing the text by characters does not
    # bound the number of tokens, and inputs longer than the model's
    # 512-position limit would fail in the forward pass.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    # argmax yields a 0-based class index; classes correspond to 1..5 stars.
    return int(torch.argmax(result.logits)) + 1

In [67]:
# Score each review on its first 512 characters and store the predicted
# star rating in a new 'result' column.
clean_data['result'] = clean_data["reviewText"].str.slice(0, 512).apply(sentiment_score)

In [68]:
# scikit-learn metrics take (y_true, y_pred): the ground-truth star ratings
# come first, the model's predictions second.
accuracy_score(clean_data["overall"], clean_data["result"])

0.5646984924623115

In [69]:
# Star ratings to evaluate, each one treated as a one-vs-rest binary problem.
categories = [5.0, 4.0, 3.0, 2.0, 1.0]

schema={'Category':'float64','Precision':'float64','Recall':'float64'}

# Compute precision and recall for each category. Rows are collected in a
# list and turned into a DataFrame once, instead of concatenating onto a
# growing (initially empty) frame inside the loop.
records = []
for category in categories:
    # Binarize the expected ('overall') and predicted ('result') ratings
    # with respect to the current category.
    expected_binary = (clean_data['overall'] == category).astype(int)
    predicted_binary = (clean_data['result'] == category).astype(int)

    # Precision and recall for the current category.
    precision = precision_score(expected_binary, predicted_binary)
    recall = recall_score(expected_binary, predicted_binary)

    records.append({'Category':category, 'Precision':round(precision,3), 'Recall':round(recall,3)})

df_result_star_rating = pd.DataFrame(records, columns=list(schema)).astype(schema)

df_result_star_rating

Unnamed: 0,Category,Precision,Recall
0,5.0,0.659,0.728
1,4.0,0.567,0.392
2,3.0,0.329,0.366
3,2.0,0.211,0.438
4,1.0,0.698,0.693


In [70]:
# Star rating -> sentiment label. 'overall' holds floats and 'result' holds
# ints, so each column gets a mapping keyed by its own value type.
overall_to_sentiment = {5.: "positive", 4.0: "positive", 3.0:"neutral",2.0:"negative",1.0:"negative"}
result_to_sentiment = {5: "positive", 4: "positive", 3:"neutral",2:"negative",1:"negative"}

# Build a separate frame with both columns relabelled; clean_data is left untouched.
clean_data_2 = clean_data.assign(
    overall=clean_data["overall"].replace(overall_to_sentiment),
    result=clean_data["result"].replace(result_to_sentiment),
)

# Distribution of the predicted sentiment labels.
clean_data_2["result"].value_counts()

positive    1026
negative     408
neutral      158
Name: result, dtype: int64

In [71]:
# Sentiment labels to evaluate, each one treated as a one-vs-rest binary problem.
categories = ["positive","neutral","negative"]

schema={'Category':'string','Precision':'float64','Recall':'float64'}

# Compute precision and recall for each category. Rows are collected in a
# list and turned into a DataFrame once, instead of concatenating onto a
# growing (initially empty) frame inside the loop.
records = []
for category in categories:
    # Binarize the expected ('overall') and predicted ('result') labels
    # with respect to the current category.
    expected_binary = (clean_data_2['overall'] == category).astype(int)
    predicted_binary = (clean_data_2['result'] == category).astype(int)

    # Precision and recall for the current category.
    precision = precision_score(expected_binary, predicted_binary)
    recall = recall_score(expected_binary, predicted_binary)

    records.append({'Category':category, 'Precision':round(precision,3), 'Recall':round(recall,3)})

df_result_sentiment = pd.DataFrame(records, columns=list(schema)).astype(schema)

df_result_sentiment

Unnamed: 0,Category,Precision,Recall
0,positive,0.958,0.873
1,neutral,0.329,0.366
2,negative,0.725,0.914
