In [25]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
# from tqdm.auto import tqdm  # for notebooks

# Register tqdm-backed `progress_*` methods on pandas objects; the
# `progress_apply` call later in this notebook depends on this having run.
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [26]:
# Checkpoint directory passed to `from_pretrained` for both the tokenizer and
# the model (a fine-tuned DistilBERT sentiment model, going by the path name).
model_name = "/kaggle/input/distillbert_sentiment/pytorch/default/1/distilbert_finetuned_setiment"

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [60]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint directory; `get_prediction` reads both as module-level globals.
tok = AutoTokenizer.from_pretrained(model_name)
mod = AutoModelForSequenceClassification.from_pretrained(model_name)


def convert_to_normalized_score2(probability):
    """Rescale a signed confidence value linearly.

    Args:
        probability: Signed number. Positive values are rescaled with
            ``2 * (p - 0.5)`` (so 0.5 -> 0 and 1 -> 1); negative values are
            rescaled with ``2 * (p + 1) - 1`` (so -1 -> -1 and -0.5 -> 0).

    Returns:
        The rescaled score.

    Raises:
        ValueError: If ``probability`` is neither greater than nor less than
            zero (e.g. exactly 0).
    """
    # Positive side: [0.5, 1] -> [0, 1]
    if probability > 0:
        return 2 * (probability - 0.5)

    # Negative side: [-1, -0.5] -> [-1, 0]
    if probability < 0:
        return 2 * (probability + 1) - 1

    raise ValueError("Invalid class label")


def get_prediction(text):
    """Score the sentiment of ``text`` with the loaded classifier.

    The text is tokenized with the module-level tokenizer ``tok`` (truncated
    to at most 512 tokens), passed through the module-level model ``mod``,
    and the logits are converted to class probabilities with a softmax.

    Args:
        text: The review text to score.

    Returns:
        float: ``prob`` (the largest class probability) when the predicted
        class index is non-zero, otherwise ``2 * (-prob + 1) - 1``
        (i.e. ``1 - 2 * prob``).

    NOTE(review): assuming a two-class head (so ``prob >= 0.5``), the
    non-zero-class branch yields values in [0.5, 1] while the class-0 branch
    yields values in [-1, 0]. The two sides are scaled differently and the
    score jumps from about 0 to about 0.5 at the decision boundary. Returning
    ``2 * (prob - 0.5)`` for the non-zero class would make the mapping
    symmetric -- confirm which behaviour is intended before changing it,
    since downstream averages depend on these values.
    NOTE(review): any non-zero class index takes the first branch; verify
    this if the checkpoint has more than two labels.
    """
    input_ids = tok.encode(text, truncation=True, max_length=512, return_tensors='pt')

    # Inference only: disable autograd so no graph is built or kept alive for
    # each call. This does not change the numbers produced.
    with torch.no_grad():
        output = mod(input_ids)

    preds = torch.nn.functional.softmax(output.logits, dim=-1)

    prob = torch.max(preds).item()   # confidence of the winning class
    idx = torch.argmax(preds).item()  # index of the winning class

    if idx:
        return prob
    return 2 * (-prob + 1) - 1

In [61]:
# Smoke-test the scorer on a single sentence before running it on the full dataset.
text = "The movie was okay"
get_prediction(text)

0.5

In [40]:
# Load the review CSV files and stack them into a single frame.
file_paths = [
    '/kaggle/input/tt0111161/cleaned_scored_reviews_tt0111161.csv',
    '/kaggle/input/tt0455944/cleaned_scored_reviews_tt0455944.csv',
    '/kaggle/input/tt0468569/cleaned_scored_reviews_tt0468569.csv',
    '/kaggle/input/tt15398776/cleaned_scored_reviews_tt15398776.csv',
]
dfs = [pd.read_csv(file) for file in file_paths]
df2 = pd.concat(dfs, ignore_index=True)

# Keep only rows that have both a review text and a sentiment score.
df2 = df2.dropna(subset=['Review', 'Sentiment_Score'])

print(df2.head())

print('=====')

# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df2.info()

                                           Title              Author  \
0           Some birds aren't meant to be caged.  hitchcockthelegend   
1  An incredible movie. One that lives with you.      Sleepin_Dragon   
2                          Don't Rent Shawshank.            EyeDunno   
3  A classic piece of unforgettable film-making.            kaspen12   
4              This is How Movies Should Be Made    alexkolokotronis   

               Date  Stars(out_of_10)  \
0      24 July 2010              10.0   
1  17 February 2021              10.0   
2  21 November 2005              10.0   
3  10 February 2006              10.0   
4  18 February 2008              10.0   

                                              Review  Sentiment_Score  \
0  the shawshank redemption is written and direct...           0.9983   
1  it is no wonder that the film has such a high ...           0.9766   
2  im trying to save you money this is the last f...           0.9935   
3  in its oscar year shawsha

In [41]:
# Score every review with the classifier; progress_apply shows a tqdm bar.
df2['BERT_Sentiment_Score'] = df2['Review'].progress_apply(get_prediction)

100%|██████████| 24845/24845 [44:03<00:00,  9.40it/s]  


In [42]:
# Spot-check the newly added BERT_Sentiment_Score column.
df2.head()

Unnamed: 0,Title,Author,Date,Stars(out_of_10),Review,Sentiment_Score,Polarity,BERT_Sentiment_Score
0,Some birds aren't meant to be caged.,hitchcockthelegend,24 July 2010,10.0,the shawshank redemption is written and direct...,0.9983,1,0.981635
1,An incredible movie. One that lives with you.,Sleepin_Dragon,17 February 2021,10.0,it is no wonder that the film has such a high ...,0.9766,1,0.987899
2,Don't Rent Shawshank.,EyeDunno,21 November 2005,10.0,im trying to save you money this is the last f...,0.9935,1,0.981112
3,A classic piece of unforgettable film-making.,kaspen12,10 February 2006,10.0,in its oscar year shawshank redemption written...,0.9993,1,0.986902
4,This is How Movies Should Be Made,alexkolokotronis,18 February 2008,10.0,this movie is not your ordinary hollywood flic...,0.9907,1,0.98841


In [43]:
# Free up the name 'Sentiment_Score' for the combined score computed later.
df2 = df2.rename(columns={'Sentiment_Score': 'VADER_Sentiment_Score'})

In [44]:
# Confirm the column now appears as VADER_Sentiment_Score.
df2.head()

Unnamed: 0,Title,Author,Date,Stars(out_of_10),Review,VADER_Sentiment_Score,Polarity,BERT_Sentiment_Score
0,Some birds aren't meant to be caged.,hitchcockthelegend,24 July 2010,10.0,the shawshank redemption is written and direct...,0.9983,1,0.981635
1,An incredible movie. One that lives with you.,Sleepin_Dragon,17 February 2021,10.0,it is no wonder that the film has such a high ...,0.9766,1,0.987899
2,Don't Rent Shawshank.,EyeDunno,21 November 2005,10.0,im trying to save you money this is the last f...,0.9935,1,0.981112
3,A classic piece of unforgettable film-making.,kaspen12,10 February 2006,10.0,in its oscar year shawshank redemption written...,0.9993,1,0.986902
4,This is How Movies Should Be Made,alexkolokotronis,18 February 2008,10.0,this movie is not your ordinary hollywood flic...,0.9907,1,0.98841


In [45]:
# Check dtypes and non-null counts before combining the two scores.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24845 entries, 0 to 24850
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Title                  24844 non-null  object 
 1   Author                 24845 non-null  object 
 2   Date                   24845 non-null  object 
 3   Stars(out_of_10)       23872 non-null  float64
 4   Review                 24845 non-null  object 
 5   VADER_Sentiment_Score  24845 non-null  float64
 6   Polarity               24845 non-null  int64  
 7   BERT_Sentiment_Score   24845 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 1.7+ MB


In [46]:
# Combined score: row-wise mean of the VADER and BERT scores.
df2['Sentiment_Score'] = df2[['VADER_Sentiment_Score', 'BERT_Sentiment_Score']].mean(axis=1)

In [47]:
# Spot-check the combined Sentiment_Score column.
df2.head()

Unnamed: 0,Title,Author,Date,Stars(out_of_10),Review,VADER_Sentiment_Score,Polarity,BERT_Sentiment_Score,Sentiment_Score
0,Some birds aren't meant to be caged.,hitchcockthelegend,24 July 2010,10.0,the shawshank redemption is written and direct...,0.9983,1,0.981635,0.989968
1,An incredible movie. One that lives with you.,Sleepin_Dragon,17 February 2021,10.0,it is no wonder that the film has such a high ...,0.9766,1,0.987899,0.982249
2,Don't Rent Shawshank.,EyeDunno,21 November 2005,10.0,im trying to save you money this is the last f...,0.9935,1,0.981112,0.987306
3,A classic piece of unforgettable film-making.,kaspen12,10 February 2006,10.0,in its oscar year shawshank redemption written...,0.9993,1,0.986902,0.993101
4,This is How Movies Should Be Made,alexkolokotronis,18 February 2008,10.0,this movie is not your ordinary hollywood flic...,0.9907,1,0.98841,0.989555


In [48]:
def get_class(score):
    """Binarize a sentiment score.

    Args:
        score: Numeric sentiment score.

    Returns:
        int: 1 when ``score`` is strictly greater than zero, otherwise 0
        (zero itself, negative values, and anything for which ``score > 0``
        is false all give 0).
    """
    return 1 if score > 0 else 0

In [49]:
# Polarity label from the combined score: 1 if strictly positive, else 0.
# Vectorized equivalent of applying get_class to each element.
df2['Polarity'] = (df2['Sentiment_Score'] > 0).astype('int64')

In [50]:
# Keep the final set of columns, in output order.
df2 = df2[[
    'Title',
    'Author',
    'Date',
    'Stars(out_of_10)',
    'Review',
    'VADER_Sentiment_Score',
    'BERT_Sentiment_Score',
    'Sentiment_Score',
    'Polarity',
]]

In [51]:
# Preview the first 15 rows with the final column order.
df2.head(15)

Unnamed: 0,Title,Author,Date,Stars(out_of_10),Review,VADER_Sentiment_Score,BERT_Sentiment_Score,Sentiment_Score,Polarity
0,Some birds aren't meant to be caged.,hitchcockthelegend,24 July 2010,10.0,the shawshank redemption is written and direct...,0.9983,0.981635,0.989968,1
1,An incredible movie. One that lives with you.,Sleepin_Dragon,17 February 2021,10.0,it is no wonder that the film has such a high ...,0.9766,0.987899,0.982249,1
2,Don't Rent Shawshank.,EyeDunno,21 November 2005,10.0,im trying to save you money this is the last f...,0.9935,0.981112,0.987306,1
3,A classic piece of unforgettable film-making.,kaspen12,10 February 2006,10.0,in its oscar year shawshank redemption written...,0.9993,0.986902,0.993101,1
4,This is How Movies Should Be Made,alexkolokotronis,18 February 2008,10.0,this movie is not your ordinary hollywood flic...,0.9907,0.98841,0.989555,1
5,Prepare to be moved,speedreid,8 February 2001,,i have never seen such an amazing film since i...,0.9712,0.988236,0.979718,1
6,Shawshank Redeems Hollywood,weswalker,27 August 2002,10.0,can hollywood usually creating things for ente...,0.9994,0.985542,0.992471,1
7,the shawshank redemption,auuwws,8 October 2020,10.0,the best movie in history and the best ending ...,0.9042,0.926155,0.915177,1
8,Eternal Hope,nowego,15 June 2018,10.0,ive lost count of the number of times i have s...,0.9943,0.988509,0.991404,1
9,The Shawshank Redemption,Coxer99,25 March 1999,10.0,one of the finest films made in recent years i...,0.9829,0.988651,0.985775,1


In [52]:
# Persist the scored reviews to the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed first column (non-contiguous after the earlier row
# filtering). Consider index=False if downstream readers don't need it.
df2.to_csv('vader_dbert_scored_augmented.csv')