In [1]:
# import libraries
import numpy as np
import pandas as pd

In [31]:
# Load the movie-review dataset; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')

In [32]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [33]:
# Number of rows (reviews) currently in the frame.
df.shape[0]

2000

In [34]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

label      0
review    35
dtype: int64

In [35]:
# The null check reported 35 reviews as missing (NaN). Those rows carry no
# text to analyse, so discard every row containing a missing value and then
# confirm that no missing values remain.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

label     0
review    0
dtype: int64

In [36]:
# check for empty strings: reviews that are blank or contain only whitespace

# Walk the 'review' column directly instead of unpacking df.itertuples() into
# a fixed number of names: that unpacking raises ValueError as soon as df has
# any additional column, which makes the cell impossible to re-run once later
# cells have added columns.
# Non-string entries are skipped; the index label of every blank review is
# collected so the rows can be dropped afterwards.
blanks = [
    idx
    for idx, text in zip(df.index, df['review'])
    if isinstance(text, str) and (not text or text.isspace())
]
print(len(blanks))

27


In [37]:
# Remove the blank reviews whose index labels were collected in `blanks`.
# errors='ignore' keeps the cell re-runnable: on a second run the labels are
# already gone and a plain drop would raise KeyError.
df.drop(blanks, inplace=True, errors='ignore')
len(df)

1938

In [39]:
# Class balance of the manual labels after cleaning.
df.loc[:, 'label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [40]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [42]:
# Create the VADER analyzer once; the cells below reuse it for every review.
analyzer = SentimentIntensityAnalyzer()

In [44]:
# Compound scores split by sign: pos_rw collects scores >= 0, neg_rw the rest.
pos_rw = []
neg_rw = []

# Iterate over the review text only. Unpacking df.itertuples() into exactly
# three names raises ValueError once df has more than the two original
# columns, so the cell could not be re-run after the score columns are added.
for review_text in df['review']:
    scores = analyzer.polarity_scores(review_text)
    compound = scores['compound']
    if compound >= 0:
        pos_rw.append(compound)
    else:
        neg_rw.append(compound)

In [49]:
# Report how many reviews fell into each bucket (compound >= 0 vs. compound < 0).
print(f'positive reviews: {len(pos_rw)}, negative reviews: {len(neg_rw)}')

positive reviews: 1349, negative reviews: 589


In [50]:
# Add a 'scores' column holding the VADER polarity-score dict of each review.

df['scores'] = df['review'].apply(analyzer.polarity_scores)

In [51]:
# Inspect the first five rows, now including the 'scores' column.
df.head(n=5)

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com..."


In [52]:
# Pull the 'compound' value out of each score dict into its own column.
df['compound'] = df['scores'].map(lambda vader_scores: vader_scores['compound'])

In [53]:
# Inspect the first five rows, now including the 'compound' column.
df.head(n=5)

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264


In [56]:
# Turn the compound score into a predicted label:
# non-negative scores become 'pos', everything else 'neg'.
df['comp_label'] = np.where(df['compound'] >= 0, 'pos', 'neg')

In [59]:
# Inspect the first five rows, now including the predicted label.
df.head(n=5)

Unnamed: 0,label,review,scores,compound,com_label,comp_label
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg,neg


In [60]:
# Remove the stray 'com_label' column (a misspelled duplicate of 'comp_label',
# presumably left over from an earlier run of the labelling cell).
# No cell in this notebook creates 'com_label', so a plain `del df['com_label']`
# raises KeyError on a fresh kernel; errors='ignore' drops the column only
# when it actually exists.
df.drop(columns=['com_label'], inplace=True, errors='ignore')

In [61]:
# Confirm the frame now has a single predicted-label column.
df.head(n=5)

Unnamed: 0,label,review,scores,compound,comp_label
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [62]:
# compare our comp_label against the manual label

In [63]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [81]:
# Share of reviews whose VADER-derived label agrees with the manual label.
accuracy = accuracy_score(y_true=df['label'], y_pred=df['comp_label'])
print('{:.2f}'.format(accuracy))

0.64


In [85]:
# Per-class precision / recall / f1 of the VADER labels against the manual labels.
report = classification_report(y_true=df['label'], y_pred=df['comp_label'])

In [84]:
# print() renders the report string as an aligned table.
print(report)

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

   micro avg       0.64      0.64      0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [86]:
# Cross-tabulate the manual labels against the VADER-derived labels.
c_matrix = confusion_matrix(y_true=df['label'], y_pred=df['comp_label'])

In [87]:
# Rows are the manual labels and columns the predicted labels, both in the
# order (neg, pos) -- consistent with the recall values in the report above.
print(c_matrix)

[[427 542]
 [162 807]]
