In [116]:
import pandas as pd
import numpy as np

In [117]:
data = pd.read_csv('yelp.csv')
data

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,VY_tvNUCCXGXQeSvJl757Q,2012-07-28,Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...,review,_eqQoPtQ3e3UxLE4faT6ow,1,2,0
9996,EKzMHI1tip8rC1-ZAy64yg,2012-01-18,2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...,review,ROru4uk5SaYc3rg8IU7SQw,0,0,0
9997,53YGfwmbW73JhFiemNeyzQ,2010-11-16,jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...,review,gGbN1aKQHMgfQZkqlsuwzg,0,0,0
9998,9SKdOoDHcFoxK5ZtsgHJoA,2012-12-02,5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...,review,0lyVoNazXa20WzUyZPLaQQ,0,0,0


In [118]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [119]:
df = data[:30]
for column in list(df.columns):
    print(column)

business_id
date
review_id
stars
text
type
user_id
cool
useful
funny


<h1>Features to use</h1>

1. review_id
2. stars
3. text



In [120]:
reviews = df[['review_id','text','stars']]
reviews.head()
len(reviews)


30

In [121]:
def get_sentiment(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)  
    return int(torch.argmax(result.logits)) + 1

In [122]:
reviews['sentiment'] = reviews['text'].apply(lambda x: get_sentiment(x))
reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['sentiment'] = reviews['text'].apply(lambda x: get_sentiment(x))


Unnamed: 0,review_id,text,stars,sentiment
0,fWKvX83p0-ka4JS3dc6E5A,My wife took me here on my birthday for breakf...,5,5
1,IjZ33sJrzXqU-0X6U8NwyA,I have no idea why some people give bad review...,5,5
2,IESLBzqUCLdSzSqm0eCSxQ,love the gyro plate. Rice is so good and I als...,4,5
3,G-WvGaISbqqaMHlNnByodA,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,5
4,1uJFq2r5QfJG_6ExMRCaGw,General Manager Scott Petello is a good egg!!!...,5,4
5,m2CKSsepBCoRYWxiRUsxAg,"Quiessence is, simply put, beautiful. Full wi...",4,4
6,riFQ3vxNpP4rWLk_CSri2A,Drop what you're doing and drive here. After I...,5,2
7,JL7GXJ9u4YMx7Rzs05NfiQ,"Luckily, I didn't have to travel far to make m...",4,3
8,XtnfnYmnJYi71yIuGsXIUA,Definitely come for Happy hour! Prices are ama...,4,5
9,jJAIXA46pU1swYyRCdfXtQ,Nobuo shows his unique talents with everything...,5,4


In [123]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(reviews['stars'], reviews['sentiment'])
mae = mean_absolute_error(reviews['stars'], reviews['sentiment'])
r2 = r2_score(reviews['stars'], reviews['sentiment'])

print(f"MSE: {mse*100:2f}%")
print(f"MAE: {mae*100:2f}%")
print(f"R2: {r2*100:2f}%")

MSE: 83.333333%
MAE: 56.666667%
R2: 13.194444%


In [124]:
def rating(star):
    star = int(star)
    if star <= 3:
        return 'negative'
    else:
        return 'positive'
reviews['real_ratings'] = reviews['stars'].apply(rating)
reviews['sentiment_ratings'] = reviews['sentiment'].apply(rating)
reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['real_ratings'] = reviews['stars'].apply(rating)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['sentiment_ratings'] = reviews['sentiment'].apply(rating)


Unnamed: 0,review_id,text,stars,sentiment,real_ratings,sentiment_ratings
0,fWKvX83p0-ka4JS3dc6E5A,My wife took me here on my birthday for breakf...,5,5,positive,positive
1,IjZ33sJrzXqU-0X6U8NwyA,I have no idea why some people give bad review...,5,5,positive,positive
2,IESLBzqUCLdSzSqm0eCSxQ,love the gyro plate. Rice is so good and I als...,4,5,positive,positive
3,G-WvGaISbqqaMHlNnByodA,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,5,positive,positive
4,1uJFq2r5QfJG_6ExMRCaGw,General Manager Scott Petello is a good egg!!!...,5,4,positive,positive
5,m2CKSsepBCoRYWxiRUsxAg,"Quiessence is, simply put, beautiful. Full wi...",4,4,positive,positive
6,riFQ3vxNpP4rWLk_CSri2A,Drop what you're doing and drive here. After I...,5,2,positive,negative
7,JL7GXJ9u4YMx7Rzs05NfiQ,"Luckily, I didn't have to travel far to make m...",4,3,positive,negative
8,XtnfnYmnJYi71yIuGsXIUA,Definitely come for Happy hour! Prices are ama...,4,5,positive,positive
9,jJAIXA46pU1swYyRCdfXtQ,Nobuo shows his unique talents with everything...,5,4,positive,positive


In [125]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(reviews['real_ratings'], reviews['sentiment_ratings'])
print(f"Accuracy Score: {accuracy*100:2f}%")

Accuracy Score: 86.666667%


<h1>Using DistilBERT</h1>

In [137]:
from transformers import pipeline

reviews = data[['review_id','text','stars']]
sentiment_pipeline = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")
reviews["distil_sentiment"] = reviews["text"].apply(lambda x: sentiment_pipeline(x[:512])[0]['label'])
reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews["distil_sentiment"] = reviews["text"].apply(lambda x: sentiment_pipeline(x[:512])[0]['label'])


Unnamed: 0,review_id,text,stars,distil_sentiment
0,fWKvX83p0-ka4JS3dc6E5A,My wife took me here on my birthday for breakf...,5,POSITIVE
1,IjZ33sJrzXqU-0X6U8NwyA,I have no idea why some people give bad review...,5,NEGATIVE
2,IESLBzqUCLdSzSqm0eCSxQ,love the gyro plate. Rice is so good and I als...,4,POSITIVE
3,G-WvGaISbqqaMHlNnByodA,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,POSITIVE
4,1uJFq2r5QfJG_6ExMRCaGw,General Manager Scott Petello is a good egg!!!...,5,POSITIVE
...,...,...,...,...
9995,Ubyfp2RSDYW0g7Mbr8N3iA,First visit...Had lunch here today - used my G...,3,POSITIVE
9996,2XyIOQKbVFb6uXQdJ0RzlQ,Should be called house of deliciousness!\n\nI ...,4,POSITIVE
9997,jyznYkIbpqVmlsZxSDSypA,I recently visited Olive and Ivy for business ...,4,POSITIVE
9998,5UKq9WQE1qQbJ0DJbc-B6Q,My nephew just moved to Scottsdale recently so...,2,NEGATIVE


In [138]:
reviews['distil_sentiment'] = reviews['distil_sentiment'].str.lower()
reviews


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['distil_sentiment'] = reviews['distil_sentiment'].str.lower()


Unnamed: 0,review_id,text,stars,distil_sentiment
0,fWKvX83p0-ka4JS3dc6E5A,My wife took me here on my birthday for breakf...,5,positive
1,IjZ33sJrzXqU-0X6U8NwyA,I have no idea why some people give bad review...,5,negative
2,IESLBzqUCLdSzSqm0eCSxQ,love the gyro plate. Rice is so good and I als...,4,positive
3,G-WvGaISbqqaMHlNnByodA,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,positive
4,1uJFq2r5QfJG_6ExMRCaGw,General Manager Scott Petello is a good egg!!!...,5,positive
...,...,...,...,...
9995,Ubyfp2RSDYW0g7Mbr8N3iA,First visit...Had lunch here today - used my G...,3,positive
9996,2XyIOQKbVFb6uXQdJ0RzlQ,Should be called house of deliciousness!\n\nI ...,4,positive
9997,jyznYkIbpqVmlsZxSDSypA,I recently visited Olive and Ivy for business ...,4,positive
9998,5UKq9WQE1qQbJ0DJbc-B6Q,My nephew just moved to Scottsdale recently so...,2,negative


In [142]:

reviews['real_ratings'] = reviews['stars'].apply(rating)

distil_acc = accuracy_score(reviews['real_ratings'], reviews['distil_sentiment'])
distil_acc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['real_ratings'] = reviews['stars'].apply(rating)


0.8133

In [143]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(reviews["real_ratings"], reviews["distil_sentiment"]))
print("\nConfusion Matrix:\n", confusion_matrix(reviews["real_ratings"], reviews["distil_sentiment"]))

              precision    recall  f1-score   support

    negative       0.72      0.66      0.69      3137
    positive       0.85      0.88      0.87      6863

    accuracy                           0.81     10000
   macro avg       0.79      0.77      0.78     10000
weighted avg       0.81      0.81      0.81     10000


Confusion Matrix:
 [[2075 1062]
 [ 805 6058]]
