# Sentiment Analysis Project


## Load the Data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the tab-separated movie-review file into a DataFrame.
reviews_path = "/content/moviereviews.tsv"
df = pd.read_csv(reviews_path, sep="\t")

# Preview the first rows of the raw data.
print(df.head())

  label                                             review
0   neg  how do films like mouse hunt get into theatres...
1   neg  some talented actresses are blessed with a dem...
2   pos  this has been an extraordinary year for austra...
3   pos  according to hollywood movies made in last few...
4   neg  my first press screening of 1998 and already i...


In [4]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

(2000, 2)

## Remove Blank Records (optional)

In [5]:
# Remove rows where 'review' is NaN (missing)
df = df.dropna(subset=["review"])

# Remove rows where 'review' is just an empty string or whitespace.
# .copy() makes the filtered result an independent frame, so the columns
# assigned in later cells are written to this frame itself and not to a
# slice of the raw data (the situation pandas flags with
# SettingWithCopyWarning).
df = df[df["review"].str.strip() != ""].copy()

print(df.shape)   # Check new number of rows

(1938, 2)


In [6]:
# Preview the first rows after the blank reviews were removed
print(df.head())

  label                                             review
0   neg  how do films like mouse hunt get into theatres...
1   neg  some talented actresses are blessed with a dem...
2   pos  this has been an extraordinary year for austra...
3   pos  according to hollywood movies made in last few...
4   neg  my first press screening of 1998 and already i...


## Import `SentimentIntensityAnalyzer` and create an sid object
The analyzer requires the VADER lexicon to be available locally; the next cell downloads it.

In [7]:
import nltk
# Download the VADER lexicon used by the sentiment analyzer created below
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Create the analyzer once; later cells reuse `sid` to score each review
sid = SentimentIntensityAnalyzer()

## Use sid to append a `comp_score` to the dataset

In [49]:
# Show the full text of the first review. Positional .iloc is used because
# rows were filtered earlier without resetting the index, so the label 0 is
# not guaranteed to exist (a label lookup would raise KeyError if it is gone).
df['review'].iloc[0]

'how do films like mouse hunt get into theatres ? \r\nisn\'t there a law or something ? \r\nthis diabolical load of claptrap from steven speilberg\'s dreamworks studio is hollywood family fare at its deadly worst . \r\nmouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . \r\nwriter adam rifkin and director gore verbinski are the names chiefly responsible for this swill . \r\nthe plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . \r\ndeciding to check out the long-abandoned house , they soon learn that it\'s worth a fortune and set about selling it in auction to the highest bidder . \r\nbut battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . \r\

In [50]:
# Score the first review as a sanity check before scoring the whole column.
# Positional .iloc is used because the index was not reset after filtering,
# so a lookup by label 0 could raise KeyError.
sid.polarity_scores(df['review'].iloc[0])

{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'compound': -0.9125}

In [51]:
# Score every review; each entry of 'scores' is the dict returned by
# polarity_scores (keys: neg, neu, pos, compound).
df['scores'] = df['review'].apply(sid.polarity_scores)

In [52]:
# Check that the new 'scores' column holds one polarity dict per review
df.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [53]:
# Pull the 'compound' value out of each polarity dict into its own column.
df['comp_score'] = [polarity['compound'] for polarity in df['scores']]

In [54]:
# Check that 'comp_score' matches the compound value inside 'scores'
df.head()

Unnamed: 0,label,review,scores,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


In [56]:
# Turn the compound score into a binary label: non-negative scores become
# 'pos', everything else 'neg'.
df['comp_score_label'] = np.where(df['comp_score'] >= 0, 'pos', 'neg')

In [57]:
# Check the predicted label next to the original 'label' column
df.head()

Unnamed: 0,label,review,scores,comp_score,comp_score_label
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


## Perform a comparison analysis between the original `label` and `comp_score_label`

**Comparing the number of `pos` and `neg` values in `label` vs. `comp_score_label`**

In [58]:
# Class balance of the original (ground-truth) labels
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neg,969
pos,969


In [59]:
# Class balance of the labels derived from the compound score
df['comp_score_label'].value_counts()

Unnamed: 0_level_0,count
comp_score_label,Unnamed: 1_level_1
pos,1347
neg,591


In [60]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [61]:
# Fraction of reviews whose score-derived label matches the original label.
accuracy_score(y_true=df['label'], y_pred=df['comp_score_label'])

0.6357069143446853

In [62]:
# Per-class precision, recall and F1 of the score-derived labels against the
# original labels.
report = classification_report(y_true=df['label'], y_pred=df['comp_score_label'])
print(report)

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

