# *Imports*


In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# *Load Data: use only the first 100 examples because of the computational cost of the RoBERTa model*

In [None]:
# Read the IMDB reviews and keep only the first 100 rows, since scoring
# every review with the RoBERTa model is computationally expensive.
df = pd.read_csv('IMDB Dataset.csv').head(100)


In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
print(df.shape)


(100, 2)


# *Some Ground level NLTK*

In [None]:
example = df['review'][50]


In [None]:
example

'Return to the 36th Chamber is one of those classic Kung-Fu movies which Shaw produces back in the 70s and 80s, whose genre is equivalent to the spaghetti westerns of Hollywood, and the protagonist Gordon Liu, the counterpart to the western\'s Clint Eastwood. Digitally remastered and a new print made for the Fantastic Film Fest, this is "Presented in Shaw Scope", just like the good old days.<br /><br />This film is a simple story of good versus evil, told in 3 acts, which more or less sums up the narrative of martial arts films in that era.<br /><br />Act One sets up the premise. Workers in a dye-mill of a small village are unhappy with their lot, having their wages cut by 20% by incoming manchu gangsters. They can\'t do much about their exploitation because none of them are martial arts skilled to take on the gangsters, and their boss. At first they had a minor success in getting Liu to impersonate a highly skilled Shaolin monk (one of the best comedy sequences), but their rouse got e

**Tokenization: splitting the review into word and punctuation tokens**

In [None]:
tokens = nltk.word_tokenize(example)
tokens[:20]

['Return',
 'to',
 'the',
 '36th',
 'Chamber',
 'is',
 'one',
 'of',
 'those',
 'classic',
 'Kung-Fu',
 'movies',
 'which',
 'Shaw',
 'produces',
 'back',
 'in',
 'the',
 '70s',
 'and']

**Tagging the different Tokens**

In [None]:
tagged = nltk.pos_tag(tokens)
tagged[:20]

[('Return', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('36th', 'CD'),
 ('Chamber', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('those', 'DT'),
 ('classic', 'JJ'),
 ('Kung-Fu', 'NNP'),
 ('movies', 'NNS'),
 ('which', 'WDT'),
 ('Shaw', 'NNP'),
 ('produces', 'VBZ'),
 ('back', 'RB'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('70s', 'CD'),
 ('and', 'CC')]

**Putting Tagged Tokens into different Chunks (groups)**

In [None]:
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

(S
  (GPE Return/NN)
  to/TO
  the/DT
  36th/CD
  Chamber/NNP
  is/VBZ
  one/CD
  of/IN
  those/DT
  classic/JJ
  Kung-Fu/NNP
  movies/NNS
  which/WDT
  (PERSON Shaw/NNP)
  produces/VBZ
  back/RB
  in/IN
  the/DT
  70s/CD
  and/CC
  80s/CD
  ,/,
  whose/WP$
  genre/NN
  is/VBZ
  equivalent/JJ
  to/TO
  the/DT
  spaghetti/NNS
  westerns/NNS
  of/IN
  (GPE Hollywood/NNP)
  ,/,
  and/CC
  the/DT
  protagonist/NN
  (PERSON Gordon/NNP Liu/NNP)
  ,/,
  the/DT
  counterpart/NN
  to/TO
  the/DT
  western/NN
  's/POS
  (PERSON Clint/NNP Eastwood/NNP)
  ./.
  (PERSON Digitally/NNP)
  remastered/VBD
  and/CC
  a/DT
  new/JJ
  print/NN
  made/VBN
  for/IN
  the/DT
  (ORGANIZATION Fantastic/NNP Film/NNP Fest/NNP)
  ,/,
  this/DT
  is/VBZ
  ``/``
  Presented/VBN
  in/IN
  (GPE Shaw/NNP)
  Scope/NNP
  ''/''
  ,/,
  just/RB
  like/IN
  the/DT
  good/JJ
  old/JJ
  days./NN
  </NNP
  br/NN
  //NNP
  >/NNP
  </NNP
  br/NN
  //NNP
  >/NNP
  This/DT
  film/NN
  is/VBZ
  a/DT
  simple/JJ
  story/NN
  of/IN


# *Old School Approach : VADER Sentiment Scoring*

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Model Vibe Check !**

In [None]:
sia.polarity_scores('VIVA TOUNIZI LETS GOOOO')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [None]:
sia.polarity_scores('FRANCA MA TERBA7CH >:(')

{'neg': 0.552, 'neu': 0.448, 'pos': 0.0, 'compound': -0.5719}

# *VADER Model on the whole dataset*

In [None]:
# Score every review with VADER, keyed by the DataFrame row index.
# Each value is the dict returned by sia.polarity_scores
# (keys: 'neg', 'neu', 'pos', 'compound').
res = {
    i: sia.polarity_scores(row['review'])
    for i, row in tqdm(df.iterrows(), total=len(df))
}
res

  0%|          | 0/100 [00:00<?, ?it/s]

{0: {'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'compound': -0.9951},
 1: {'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'compound': 0.9641},
 2: {'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'compound': 0.9605},
 3: {'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'compound': -0.9213},
 4: {'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'compound': 0.9744},
 5: {'neg': 0.017, 'neu': 0.758, 'pos': 0.225, 'compound': 0.9828},
 6: {'neg': 0.024, 'neu': 0.871, 'pos': 0.104, 'compound': 0.9022},
 7: {'neg': 0.149, 'neu': 0.654, 'pos': 0.197, 'compound': 0.8596},
 8: {'neg': 0.166, 'neu': 0.662, 'pos': 0.172, 'compound': 0.2362},
 9: {'neg': 0.094, 'neu': 0.531, 'pos': 0.375, 'compound': 0.9149},
 10: {'neg': 0.084, 'neu': 0.696, 'pos': 0.221, 'compound': 0.9482},
 11: {'neg': 0.107, 'neu': 0.779, 'pos': 0.114, 'compound': 0.5223},
 12: {'neg': 0.145, 'neu': 0.75, 'pos': 0.105, 'compound': -0.9721},
 13: {'neg': 0.086, 'neu': 0.795, 'pos': 0.12, 'compound': 0.3425},
 14: {'neg': 0.117, 'neu': 0.713, 'pos': 0.

In [None]:
# One row per scored review: transpose so the ids become rows, then expose
# the id as a regular 'Movie Id' column instead of the index.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Movie Id'})
)
vaders

Unnamed: 0,Movie Id,neg,neu,pos,compound
0,0,0.203,0.748,0.048,-0.9951
1,1,0.053,0.776,0.172,0.9641
2,2,0.094,0.714,0.192,0.9605
3,3,0.138,0.797,0.065,-0.9213
4,4,0.052,0.801,0.147,0.9744
...,...,...,...,...,...
95,95,0.086,0.735,0.179,0.9826
96,96,0.167,0.735,0.099,-0.9434
97,97,0.324,0.547,0.130,-0.9882
98,98,0.203,0.729,0.068,-0.9876


# *Installation of Transformers*

In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 39.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


# *Modern Approach : Roberta Sentiment Analysis*

# *Download of the Pre-trained Weights of the Roberta Model*

In [None]:
# Hugging Face Hub id of the pretrained sentiment checkpoint; the tokenizer
# and the classification model are both loaded from this same id.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

# *Tokenization + Running the model on our data*
The model output is converted to a NumPy array so that a softmax can turn the raw scores into probabilities.

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the pretrained RoBERTa sentiment classifier.

    Parameters
    ----------
    example : str
        The text to classify.
    max_length : int, optional
        Maximum number of tokens fed to the model. Longer inputs are
        truncated to this length instead of making the model raise a
        RuntimeError. The default of 512 is the input limit of
        RoBERTa-base checkpoints.

    Returns
    -------
    dict
        Softmax probabilities under the keys 'Roberta_Negative_Review',
        'Roberta_Neutral_Review' and 'Roberta_Positive_Review'.
    """
    # Local import keeps this cell self-contained; torch is already required
    # by return_tensors='pt'.
    import torch

    # max_length is passed explicitly: truncation=True on its own only
    # truncates when the tokenizer config declares a maximum length.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single example in the batch.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'Roberta_Negative_Review': scores[0],
        'Roberta_Neutral_Review': scores[1],
        'Roberta_Positive_Review': scores[2],
    }

**Model Vibe Check !**  

In [None]:
polarity_scores_roberta("VIVA TOUNIZI LETS GOOOO")

{'Roberta_Negative_Review': 0.01409524,
 'Roberta_Neutral_Review': 0.26885659,
 'Roberta_Positive_Review': 0.71704817}

In [None]:
polarity_scores_roberta("FRANCA MA TERBA7CH >:(")

{'Roberta_Negative_Review': 0.5431578,
 'Roberta_Neutral_Review': 0.43174478,
 'Roberta_Positive_Review': 0.025097357}

In [None]:
polarity_scores_roberta("n3adlou maa franca nerb7ou australia netrach7ou !")

{'Roberta_Negative_Review': 0.07010637,
 'Roberta_Neutral_Review': 0.8533582,
 'Roberta_Positive_Review': 0.07653542}

# *Iterating Over the Dataset And Analysing each review*
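Reviews that tokenize to more than the RoBERTa model's 512-token input limit make it raise a `RuntimeError` unless they are truncated first; any id that fails is printed and skipped.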

In [None]:
# Score every review with RoBERTa, keyed by the DataFrame row index.
# An id whose review makes the model raise RuntimeError is reported and
# left out of `res`.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        res[i] = polarity_scores_roberta(row['review'])
    except RuntimeError:
        print(f'Broke for id {i}')

  0%|          | 0/100 [00:00<?, ?it/s]

Broke for id 12
Broke for id 26
Broke for id 29
Broke for id 30
Broke for id 33
Broke for id 48
Broke for id 50
Broke for id 51
Broke for id 58
Broke for id 59
Broke for id 77
Broke for id 92
Broke for id 99


In [None]:
res

{0: {'Roberta_Negative_Review': 0.33979195,
  'Roberta_Neutral_Review': 0.46273753,
  'Roberta_Positive_Review': 0.19747041},
 1: {'Roberta_Negative_Review': 0.004337342,
  'Roberta_Neutral_Review': 0.039048094,
  'Roberta_Positive_Review': 0.95661455},
 2: {'Roberta_Negative_Review': 0.08847477,
  'Roberta_Neutral_Review': 0.26314417,
  'Roberta_Positive_Review': 0.6483811},
 3: {'Roberta_Negative_Review': 0.7551167,
  'Roberta_Neutral_Review': 0.21164009,
  'Roberta_Positive_Review': 0.033243094},
 4: {'Roberta_Negative_Review': 0.024545873,
  'Roberta_Neutral_Review': 0.22359842,
  'Roberta_Positive_Review': 0.75185555},
 5: {'Roberta_Negative_Review': 0.006577765,
  'Roberta_Neutral_Review': 0.03370466,
  'Roberta_Positive_Review': 0.9597175},
 6: {'Roberta_Negative_Review': 0.05034035,
  'Roberta_Neutral_Review': 0.39188135,
  'Roberta_Positive_Review': 0.5577783},
 7: {'Roberta_Negative_Review': 0.8877086,
  'Roberta_Neutral_Review': 0.090388015,
  'Roberta_Positive_Review': 0.02


# ***Comparison of Vader and roBERTa***

In [None]:
# Score every review with both models and merge the two score dicts into
# one record per row index. VADER keys get a 'vader_' prefix; the RoBERTa
# keys are already prefixed by polarity_scores_roberta.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['review']
    vader_scores = {
        f"vader_{key}": value
        for key, value in sia.polarity_scores(text).items()
    }
    try:
        roberta_scores = polarity_scores_roberta(text)
    except RuntimeError:
        # Report the failing id and leave it out of the comparison.
        print(f'Broke for id {i}')
        continue
    res[i] = {**vader_scores, **roberta_scores}

res

  0%|          | 0/100 [00:00<?, ?it/s]

Broke for id 12
Broke for id 26
Broke for id 29
Broke for id 30
Broke for id 33
Broke for id 48
Broke for id 50
Broke for id 51
Broke for id 58
Broke for id 59
Broke for id 77
Broke for id 92
Broke for id 99


{0: {'vader_neg': 0.203,
  'vader_neu': 0.748,
  'vader_pos': 0.048,
  'vader_compound': -0.9951,
  'Roberta_Negative_Review': 0.33979195,
  'Roberta_Neutral_Review': 0.46273753,
  'Roberta_Positive_Review': 0.19747041},
 1: {'vader_neg': 0.053,
  'vader_neu': 0.776,
  'vader_pos': 0.172,
  'vader_compound': 0.9641,
  'Roberta_Negative_Review': 0.004337342,
  'Roberta_Neutral_Review': 0.039048094,
  'Roberta_Positive_Review': 0.95661455},
 2: {'vader_neg': 0.094,
  'vader_neu': 0.714,
  'vader_pos': 0.192,
  'vader_compound': 0.9605,
  'Roberta_Negative_Review': 0.08847477,
  'Roberta_Neutral_Review': 0.26314417,
  'Roberta_Positive_Review': 0.6483811},
 3: {'vader_neg': 0.138,
  'vader_neu': 0.797,
  'vader_pos': 0.065,
  'vader_compound': -0.9213,
  'Roberta_Negative_Review': 0.7551167,
  'Roberta_Neutral_Review': 0.21164009,
  'Roberta_Positive_Review': 0.033243094},
 4: {'vader_neg': 0.052,
  'vader_neu': 0.801,
  'vader_pos': 0.147,
  'vader_compound': 0.9744,
  'Roberta_Negative_

# *Review Sentiment Analysis Results (roBERTa vs Vader)*

In [None]:
# One row per successfully scored review (ids that were reported as broken
# are absent); the row index becomes the 'Movie Id' column.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Movie Id'})
)
results_df

Unnamed: 0,Movie Id,vader_neg,vader_neu,vader_pos,vader_compound,Roberta_Negative_Review,Roberta_Neutral_Review,Roberta_Positive_Review
0,0,0.203,0.748,0.048,-0.9951,0.339792,0.462738,0.197470
1,1,0.053,0.776,0.172,0.9641,0.004337,0.039048,0.956615
2,2,0.094,0.714,0.192,0.9605,0.088475,0.263144,0.648381
3,3,0.138,0.797,0.065,-0.9213,0.755117,0.211640,0.033243
4,4,0.052,0.801,0.147,0.9744,0.024546,0.223598,0.751856
...,...,...,...,...,...,...,...,...
82,94,0.033,0.843,0.125,0.9272,0.422049,0.468263,0.109688
83,95,0.086,0.735,0.179,0.9826,0.038288,0.177486,0.784227
84,96,0.167,0.735,0.099,-0.9434,0.411436,0.412445,0.176119
85,97,0.324,0.547,0.130,-0.9882,0.842104,0.120516,0.037380


# *Shawshank Redemption Example (roBERTa)*

***“GET BUSY LIVIN',OR GET BUSY DYIN'”***

In [None]:
df_shawshank = pd.read_csv('Shawshank.csv')
df_shawshank

Unnamed: 0.1,Unnamed: 0,Sentiments
0,The Shawshank Redemption is written and direct...,Some birds aren't meant to be caged.
1,It is no wonder that the film has such a high ...,An incredible movie. One that lives with you.
2,I'm trying to save you money; this is the last...,Don't Rent Shawshank.
3,This movie is not your ordinary Hollywood flic...,This is How Movies Should Be Made
4,"In its Oscar year, Shawshank Redemption (writt...",A classic piece of unforgettable film-making.
5,One of the finest films made in recent years. ...,The Shawshank Redemption
6,Misery and Stand By Me were the best adaptatio...,Stephen King's best adapted movie
7,I've lost count of the number of times I have ...,Eternal Hope
8,Two imprisoned men (Tim Robbins and Morgan Fre...,IMDb and the Greatest Film of All Time
9,"Based on a novella by Stephen King, this is be...",All-time prison film classic


In [None]:
# Score each Shawshank row with RoBERTa, keyed by the DataFrame row index.
# NOTE(review): in the preview of df_shawshank the 'Sentiments' column holds
# short review headlines while the long review text sits in 'Unnamed: 0' --
# confirm that 'Sentiments' is the column meant to be scored.
res_shawshank = {}
for i, row in tqdm(df_shawshank.iterrows(), total=len(df_shawshank)):
    try:
        res_shawshank[i] = polarity_scores_roberta(row['Sentiments'])
    except RuntimeError:
        print(f'Broke for id {i}')



  0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# One row per scored Shawshank entry; the row index becomes 'Review Id'.
results_df_shawshank = (
    pd.DataFrame(res_shawshank)
    .T
    .reset_index()
    .rename(columns={'index': 'Review Id'})
)
results_df_shawshank

Unnamed: 0,Review Id,Roberta_Negative_Review,Roberta_Neutral_Review,Roberta_Positive_Review
0,0,0.548712,0.431967,0.01932
1,1,0.001931,0.030211,0.967857
2,2,0.32621,0.642881,0.030909
3,3,0.038695,0.441816,0.51949
4,4,0.00658,0.153588,0.839832
5,5,0.045,0.848058,0.106942
6,6,0.003044,0.060496,0.936461
7,7,0.012179,0.455073,0.532748
8,8,0.007781,0.210001,0.782218
9,9,0.046544,0.555745,0.397711


# *Now let us use the pipeline from transformers*

In [None]:
from transformers import pipeline

# Name the checkpoint and revision explicitly instead of relying on the
# library default (the one reported in the recorded warning), so the
# pipeline keeps using the same model if that default changes.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

**The Ultimate Vibe Test !**

In [None]:
# Score both sentences in one call: a cell only displays the value of its
# last expression, so two separate calls would hide the first result.
sent_pipeline([
    "TOUNIZI WINS THE WORLD CUP GALLOU FRANCA W GALLOU DENMARK",
    "AMDOUNI BEST WEHED FEL MUNDO",
])

[{'label': 'POSITIVE', 'score': 0.94375079870224}]