In [28]:
import sqlite3
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

In [60]:
# Open the local SQLite file that stores the English-language reviews.
con_rev = sqlite3.connect('../database/review_en.db')

# Read the complete review_en table into a DataFrame.
df_rev = pd.read_sql_query(sql="SELECT * from review_en", con=con_rev)

# Preview the first rows as the cell output.
df_rev.head()

Unnamed: 0,index,id,wine_id,rating,note,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code
0,0,3975,1101258,4.0,Excellent full bodied wine,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en
1,1,4202,76378,5.0,The Real price at phillipson is around 1299 dk...,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en
2,2,4261,1231210,5.0,"Clean, crisp and full of nuance and elegance, ...",2011-08-11T08:06:39.000Z,3.0,1.0,images.vivino.com/labels/1312622458_6592.jpg,53601,17,en
3,3,4264,1178663,4.0,"Crisp and clean Champagne, a certain fave amon...",2011-08-11T09:28:07.000Z,25.0,2.0,images.vivino.com/labels/1312966707_6129.jpg,53601,22,en
4,4,4354,17998,4.0,It is not a champagne type wine! It is a red w...,2011-08-12T16:53:19.000Z,0.0,1.0,images.vivino.com/labels/1311842730_8268.jpg,47873,12,en


In [None]:
# Parse the timestamp strings (e.g. '2011-07-19T21:21:22.000Z') into datetimes.
# A single vectorized pd.to_datetime call replaces the per-row
# datetime.strptime lambda, which executed one Python call per review and
# raised TypeError when the cell was re-run on the already-converted column.
# The format string is unchanged, so the trailing '.000Z' is still matched
# literally and the resulting timestamps carry no timezone.
df_rev['created_at'] = pd.to_datetime(df_rev['created_at'], format='%Y-%m-%dT%H:%M:%S.000Z')

In [None]:
# Keep only the reviews written by users who have more than 5 rated reviews.
# The per-user count is broadcast back onto every row with transform('count')
# and used as a boolean mask; this selects the same rows, in the same order,
# as groupby(...).filter(lambda ...) without invoking a Python lambda once
# per user.
# NOTE(review): the name says "greater_one" but the threshold is 5 - confirm
# which of the two is intended.
ratings_per_user = df_rev.groupby(by='user_id')['rating'].transform('count')
greater_one = df_rev[ratings_per_user > 5]

### Sentiment Analysis
The pre-processing function and sentiment-analysis model used below are taken from the Hugging Face model card for <a href='https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=Covid+cases+are+increasing+fast%21'>cardiffnlp/twitter-roberta-base-sentiment-latest</a>.

In [30]:
def preprocess(text):
    """Mask user mentions and links in ``text`` before tokenization.

    The text is split on single spaces. Every piece that starts with '@' and
    is longer than one character becomes '@user'; every piece that starts
    with 'http' becomes 'http'. The pieces are then joined back with single
    spaces, so runs of consecutive spaces in the input are preserved.

    Args:
        text: The raw string to clean.

    Returns:
        The string with mentions and links replaced by placeholders.
    """
    def _mask(piece):
        # A lone '@' is not a mention and is left untouched.
        if piece.startswith('@') and len(piece) > 1:
            piece = '@user'
        if piece.startswith('http'):
            piece = 'http'
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))


# Hugging Face Hub identifier of the checkpoint used for sentiment scoring.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# The three loads are independent of each other; each one resolves the same
# checkpoint identifier.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
# Score the first 100 reviews with rating < 2 and time the run, so the cost
# of scoring the whole table can be extrapolated from the printed duration.
#
# For each review the most probable class and its softmax probability are
# computed. The probability is written to df_rev['sentiment'] with a negative
# sign when the winning class index is NEGATIVE_CLASS_ID and with a positive
# sign otherwise.
# NOTE(review): index 0 is presumably the "negative" label - verify against
# config.id2label. Any other winning class (e.g. a neutral one, if the model
# has it) is stored as a positive score - confirm this is intended.
NEGATIVE_CLASS_ID = 0

# Notes longer than the model's maximum sequence length would make the
# forward pass fail, so inputs are truncated.
# NOTE(review): 512 is presumably the limit for this RoBERTa checkpoint -
# confirm against config.max_position_embeddings.
MAX_TOKENS = 512

start = datetime.now()
low_rated_index = df_rev[df_rev.rating < 2].head(n=100).index

# Inference only: skip autograd bookkeeping for every forward pass.
with torch.no_grad():
    for i in low_rated_index:
        # .at reads the single cell instead of materializing the whole row.
        encoded = tokenizer(
            preprocess(df_rev.at[i, 'note']),
            return_tensors='pt',
            truncation=True,
            max_length=MAX_TOKENS,
        )
        # [0] selects the logits from the model output, the second [0] the
        # only sequence in the batch.
        logits = model(**encoded)[0][0]
        sentiment = softmax(logits.detach().cpu().numpy())

        # argmax returns the first index of the maximum, like the original
        # np.where(sentiment == sentiment.max())[0][0].
        top_class = int(sentiment.argmax())
        confidence = sentiment.max()
        if top_class == NEGATIVE_CLASS_ID:
            df_rev.at[i, 'sentiment'] = confidence * -1
        else:
            df_rev.at[i, 'sentiment'] = confidence
print(datetime.now() - start)

0:00:07.266468


In [72]:
# Extrapolate the timing measured on 100 reviews (about 7 seconds) to the
# whole table and express the result in hours.
SECONDS_PER_100_REVIEWS = 7
chunks_of_100 = len(df_rev) / 100
estimated_seconds = chunks_of_100 * SECONDS_PER_100_REVIEWS
estimated_minutes = estimated_seconds / 60
estimated_minutes / 60

80.24028055555556

In [59]:
# Show every review with a rating below 2, including the new 'sentiment'
# column (NaN for rows that have not been scored).
df_rev.loc[df_rev['rating'] < 2]

Unnamed: 0,index,id,wine_id,rating,note,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment
10,10,5580,2547,1.0,My experience with this wine is...Unable to dr...,2011-11-06T20:11:54.000Z,1.0,0.0,images.vivino.com/thumbs/fC6vyIS7TIOzF6dPKhiJV...,59314,28,en,-0.954367
148,148,94108,1136300,1.0,Disappointing. Does diserve cru classé rating,2012-06-24T18:01:18.000Z,1.0,1.0,images.vivino.com/labels/1340557624_9500.jpg,223344,6,en,-0.906509
182,182,116809,2693,1.0,"This is horrible, Baby Duck of Red wine! Save ...",2012-07-13T00:51:15.000Z,0.0,1.0,images.vivino.com/labels/1342135341_1620.jpg,260618,18,en,-0.933570
185,185,120123,2547,1.0,Worst wine I have tasted.,2012-07-14T19:33:42.000Z,1.0,1.0,images.vivino.com/labels/1333902709_551.jpg,161153,5,en,-0.947673
201,201,135547,1130030,1.0,Almost not worth drinking. If you are making a...,2012-07-19T12:29:35.000Z,0.0,0.0,images.vivino.com/labels/1298144436_9963.jpg,13104,26,en,-0.731541
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126438,4126438,257730538,10247,1.5,Had at J/J with Danny. Came in last place of 3...,2022-10-17T14:37:17.000Z,0.0,0.0,images.vivino.com/labels/QcA_-hMzT5SbPJT2EFBRH...,30498966,14,en,
4126477,4126477,257744562,3309775,1.5,pietroso@[1|54354071|Samuele Biondi],2022-10-17T18:08:17.000Z,0.0,0.0,images.vivino.com/thumbs/RCYVvwasRrSboD6oAuvhA...,54851792,2,en,
4126504,4126504,257755600,6410715,1.5,Not very good,2022-10-17T20:06:40.000Z,0.0,0.0,images.vivino.com/labels/uK5qz91UQ0Gl7KXw7BMzN...,56906378,3,en,
4126626,4126626,257804077,5946114,1.5,not my favourite wine at all. somewhat bitter.,2022-10-18T18:02:50.000Z,,,images.vivino.com/labels/TN96cLupTWmhJmIHhCvTw...,59067266,8,en,
