## Q1

In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


In [2]:
df = pd.read_table("MovieReview-Sample.csv",header = None, names = ['Review'])
df.head()

Unnamed: 0,Review
0,"1,""films adapted from comic books have had ple..."
1,"2,""every now and then a movie comes along from..."
2,"3,""you've got mail works alot better than it d..."
3,"4,"" jaws is a rare film that grabs your att..."
4,"5,""moviemaking is a lot like being the general..."


In [3]:
df['Review'][1]

'2,"every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody\'s surprise ( perhaps even the studio ) the film becomes a critical darling . mtv films\' _election , a high school comedy starring matthew broderick and reese witherspoon , is a current example . did anybody know this film existed a week before it opened ? the plot is deceptively simple . george washington carver high school is having student elections . tracy flick ( reese witherspoon ) is an over-achiever with her hand raised at nearly every question , way , way , high . mr .   m   ( matthew broderick ) , sick of the megalomaniac student , encourages paul , a popular-but-slow jock to run . and paul\'s nihilistic sister jumps in the race as well , for personal reasons . the dark side of such sleeper success is that , because expectations were so low going in , the fact that this was quality stuff made the reviews even more enthusiastic than they have a

In [4]:
df

Unnamed: 0,Review
0,"1,""films adapted from comic books have had ple..."
1,"2,""every now and then a movie comes along from..."
2,"3,""you've got mail works alot better than it d..."
3,"4,"" jaws is a rare film that grabs your att..."
4,"5,""moviemaking is a lot like being the general..."
...,...
1995,"1996,""if anything , stigmata should be tak..."
1996,"1997,""john boorman's zardoz is a goofy cin..."
1997,"1998,""the kids in the hall are an acquired tas..."
1998,"1999,""there was a time when john carpenter was..."


In [5]:
df.dtypes

Review    object
dtype: object

In [6]:
data = df.Review.str.lower()

In [7]:
def count_pos_neg(data, positive_dict, negative_dict):
# count of positive and negative words that appeared in each message
# net count which is calculated by positive count subtracting negative count. 
    poscnt = []
    negcnt = []
    netcnt = []

    for nrow in range(0,len(data)):
        text = data[nrow]
        
        qa = 0
        qb = 0

        for word in positive_dict :
            if (word in text) :
                qa = qa + 1

        for word in negative_dict :
            if (word in text) :
                qb = qb + 1

        qc = qa - qb

        poscnt.append(qa)
        negcnt.append(qb)
        netcnt.append(qc)

    return (poscnt, negcnt, netcnt)

## 1 - Bing Liu's Lexicon

In [8]:
import nltk
nltk.download("opinion_lexicon")

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/nicolasclarke/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [9]:
from nltk.corpus import opinion_lexicon

In [10]:
pos_list_BL=set(opinion_lexicon.positive())
neg_list_BL=set(opinion_lexicon.negative())

In [11]:
df['poscnt_BL'], df['negcnt_BL'], df['netcnt_BL'] = count_pos_neg(data, pos_list_BL, neg_list_BL)

In [12]:
df[['Review','poscnt_BL','negcnt_BL','netcnt_BL']].head(5)

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL
0,"1,""films adapted from comic books have had ple...",16,36,-20
1,"2,""every now and then a movie comes along from...",21,33,-12
2,"3,""you've got mail works alot better than it d...",32,28,4
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21
4,"5,""moviemaking is a lot like being the general...",17,24,-7


In [13]:
dfnew = pd.read_csv("MovieReview-Sample.csv",header = None)
dfnew.head()

Unnamed: 0,0,1,2
0,1,films adapted from comic books have had plenty...,1
1,2,every now and then a movie comes along from a ...,1
2,3,you've got mail works alot better than it dese...,1
3,4,jaws is a rare film that grabs your attent...,1
4,5,moviemaking is a lot like being the general ma...,1


In [14]:
df['Score'] = dfnew[2].astype(int)

In [15]:
df

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score
0,"1,""films adapted from comic books have had ple...",16,36,-20,1
1,"2,""every now and then a movie comes along from...",21,33,-12,1
2,"3,""you've got mail works alot better than it d...",32,28,4,1
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1
...,...,...,...,...,...
1995,"1996,""if anything , stigmata should be tak...",14,41,-27,0
1996,"1997,""john boorman's zardoz is a goofy cin...",28,36,-8,0
1997,"1998,""the kids in the hall are an acquired tas...",19,21,-2,0
1998,"1999,""there was a time when john carpenter was...",19,37,-18,0


In [16]:
df.loc[df['netcnt_BL'] >= 0, 'predictBL'] = 1
df.loc[df['netcnt_BL'] < 0, 'predictBL'] = 0
df['predictBL'] = df['predictBL'].astype('int')

In [17]:
df.head()

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score,predictBL
0,"1,""films adapted from comic books have had ple...",16,36,-20,1,0
1,"2,""every now and then a movie comes along from...",21,33,-12,1,0
2,"3,""you've got mail works alot better than it d...",32,28,4,1,1
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1,0
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1,0


In [18]:
confusion_matrix = pd.crosstab(df['Score'], df['predictBL'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted    0    1
Actual             
0          955   45
1          842  158


In [19]:
PrecisionBL = precision_score(df['Score'], df['predictBL'])
RecallBL = recall_score(df['Score'], df['predictBL'])
F1BL = f1_score(df['Score'], df['predictBL'])

In [20]:
print('Precision BL = ', round(PrecisionBL,3) )
print('Recall BL = ', round(RecallBL,3) )
print('F1 BL = ', round(F1BL,3) )

Precision BL =  0.778
Recall BL =  0.158
F1 BL =  0.263


In [24]:
print(classification_report(df['Score'], df['predictBL']))


              precision    recall  f1-score   support

           0       0.53      0.95      0.68      1000
           1       0.78      0.16      0.26      1000

    accuracy                           0.56      2000
   macro avg       0.65      0.56      0.47      2000
weighted avg       0.65      0.56      0.47      2000



## 2 - LM Dictionary

In [25]:
def read_local_dictionary(file):
    # create dictionary list
    words_dict = []
    with open(file, "r") as f: 
        for line in f:
            t = line.strip().lower()
            words_dict.append(t)
    return words_dict

In [26]:
pos_list_LM = read_local_dictionary('positive-words-LM.txt')
neg_list_LM = read_local_dictionary('negative-words-LM.txt')

In [27]:
df['poscnt_LM'], df['negcnt_LM'], df['netcnt_LM'] = count_pos_neg(data, pos_list_LM, neg_list_LM)

In [28]:
df[['Review','poscnt_LM','negcnt_LM','netcnt_LM']].head(5)

Unnamed: 0,Review,poscnt_LM,negcnt_LM,netcnt_LM
0,"1,""films adapted from comic books have had ple...",5,11,-6
1,"2,""every now and then a movie comes along from...",7,12,-5
2,"3,""you've got mail works alot better than it d...",12,7,5
3,"4,"" jaws is a rare film that grabs your att...",5,14,-9
4,"5,""moviemaking is a lot like being the general...",1,6,-5


In [29]:
dfnew = pd.read_csv("MovieReview-Sample.csv",header = None)

In [30]:
df['Score'] = dfnew[2].astype(int)

In [31]:
df.head()

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score,predictBL,poscnt_LM,negcnt_LM,netcnt_LM
0,"1,""films adapted from comic books have had ple...",16,36,-20,1,0,5,11,-6
1,"2,""every now and then a movie comes along from...",21,33,-12,1,0,7,12,-5
2,"3,""you've got mail works alot better than it d...",32,28,4,1,1,12,7,5
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1,0,5,14,-9
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1,0,1,6,-5


In [32]:
df.loc[df['netcnt_LM'] >= 0, 'predictLM'] = 1
df.loc[df['netcnt_LM'] < 0, 'predictLM'] = 0
df['predictLM'] = df['predictLM'].astype('int')

In [33]:
confusion_matrix = pd.crosstab(df['Score'], df['predictLM'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted    0    1
Actual             
0          897  103
1          786  214


In [34]:
PrecisionLM = precision_score(df['Score'], df['predictLM'])
RecallLM = recall_score(df['Score'], df['predictLM'])
F1LM = f1_score(df['Score'], df['predictLM'])

In [35]:
print('Precision LM = ', round(PrecisionLM,3) )
print('Recall LM = ', RecallLM )
print('F1 LM = ', round(F1LM,3) )

Precision LM =  0.675
Recall LM =  0.214
F1 LM =  0.325


In [36]:
print(classification_report(df['Score'], df['predictLM']))

              precision    recall  f1-score   support

           0       0.53      0.90      0.67      1000
           1       0.68      0.21      0.32      1000

    accuracy                           0.56      2000
   macro avg       0.60      0.56      0.50      2000
weighted avg       0.60      0.56      0.50      2000



## 3 - TextBlob

In [37]:
from textblob import TextBlob

In [38]:
df["score_TextBlob"] = df["Review"].map(lambda x:TextBlob(x).sentiment.polarity)


In [39]:
df[["Review","score_TextBlob"]]

Unnamed: 0,Review,score_TextBlob
0,"1,""films adapted from comic books have had ple...",-0.061036
1,"2,""every now and then a movie comes along from...",0.088390
2,"3,""you've got mail works alot better than it d...",0.081941
3,"4,"" jaws is a rare film that grabs your att...",0.066679
4,"5,""moviemaking is a lot like being the general...",0.054987
...,...,...
1995,"1996,""if anything , stigmata should be tak...",-0.099168
1996,"1997,""john boorman's zardoz is a goofy cin...",0.008737
1997,"1998,""the kids in the hall are an acquired tas...",0.168962
1998,"1999,""there was a time when john carpenter was...",0.097989


In [40]:
dfnew = pd.read_csv("MovieReview-Sample.csv",header = None)

In [41]:
df['Score'] = dfnew[2].astype(int)

In [42]:
df.head()

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score,predictBL,poscnt_LM,negcnt_LM,netcnt_LM,predictLM,score_TextBlob
0,"1,""films adapted from comic books have had ple...",16,36,-20,1,0,5,11,-6,0,-0.061036
1,"2,""every now and then a movie comes along from...",21,33,-12,1,0,7,12,-5,0,0.08839
2,"3,""you've got mail works alot better than it d...",32,28,4,1,1,12,7,5,1,0.081941
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1,0,5,14,-9,0,0.066679
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1,0,1,6,-5,0,0.054987


In [43]:
df.loc[df['score_TextBlob'] >= 0, 'predict_textblob'] = 1
df.loc[df['score_TextBlob'] < 0, 'predict_textblob'] = 0
df['predict_textblob'] = df['predict_textblob'].astype('int')

In [44]:
confusion_matrix = pd.crosstab(df['Score'], df['predict_textblob'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted    0    1
Actual             
0          274  726
1           79  921


In [45]:
Precisiontextblob = precision_score(df['Score'], df['predict_textblob'])
Recalltextblob = recall_score(df['Score'], df['predict_textblob'])
F1textblob = f1_score(df['Score'], df['predict_textblob'])

In [46]:
print('Precision TextBlob = ', round(Precisiontextblob,3) )
print('Recall TextBlob = ', Recalltextblob )
print('F1 TextBlob = ', round(F1textblob,3) )

Precision TextBlob =  0.559
Recall TextBlob =  0.921
F1 TextBlob =  0.696


In [47]:
print(classification_report(df['Score'], df['predict_textblob']))

              precision    recall  f1-score   support

           0       0.78      0.27      0.41      1000
           1       0.56      0.92      0.70      1000

    accuracy                           0.60      2000
   macro avg       0.67      0.60      0.55      2000
weighted avg       0.67      0.60      0.55      2000



## 4 - Vader

In [48]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [49]:
analyzer = SentimentIntensityAnalyzer()
scores = [analyzer.polarity_scores(sentence) for sentence in data]

In [50]:
neg_s = [i["neg"] for i in scores]
neu_s = [i["neu"] for i in scores]
pos_s = [i["pos"] for i in scores]
compound_s = [i["compound"] for i in scores]

In [51]:
df['negscore_Vader'], df['neuscore_Vader'], df['posscore_Vader'], df['compound_Vader'] = neg_s, neu_s, pos_s, compound_s

In [52]:
df[['Review','negscore_Vader','neuscore_Vader','posscore_Vader','compound_Vader']].head(5)

Unnamed: 0,Review,negscore_Vader,neuscore_Vader,posscore_Vader,compound_Vader
0,"1,""films adapted from comic books have had ple...",0.138,0.802,0.06,-0.9905
1,"2,""every now and then a movie comes along from...",0.069,0.833,0.098,0.8319
2,"3,""you've got mail works alot better than it d...",0.075,0.765,0.16,0.9887
3,"4,"" jaws is a rare film that grabs your att...",0.085,0.806,0.109,0.9373
4,"5,""moviemaking is a lot like being the general...",0.037,0.849,0.114,0.9819


In [53]:
dfnew = pd.read_csv("MovieReview-Sample.csv",header = None)

In [54]:
df['Score'] = dfnew[2].astype(int)

In [55]:
df.head()

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score,predictBL,poscnt_LM,negcnt_LM,netcnt_LM,predictLM,score_TextBlob,predict_textblob,negscore_Vader,neuscore_Vader,posscore_Vader,compound_Vader
0,"1,""films adapted from comic books have had ple...",16,36,-20,1,0,5,11,-6,0,-0.061036,0,0.138,0.802,0.06,-0.9905
1,"2,""every now and then a movie comes along from...",21,33,-12,1,0,7,12,-5,0,0.08839,1,0.069,0.833,0.098,0.8319
2,"3,""you've got mail works alot better than it d...",32,28,4,1,1,12,7,5,1,0.081941,1,0.075,0.765,0.16,0.9887
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1,0,5,14,-9,0,0.066679,1,0.085,0.806,0.109,0.9373
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1,0,1,6,-5,0,0.054987,1,0.037,0.849,0.114,0.9819


In [56]:
df.loc[df['compound_Vader'] >= 0, 'predict_Vader'] = 1
df.loc[df['compound_Vader'] < 0, 'predict_Vader'] = 0
df['predict_Vader'] = df['predict_Vader'].astype('int')

In [57]:
confusion_matrix = pd.crosstab(df['Score'], df['predict_Vader'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted    0    1
Actual             
0          490  510
1          267  733


In [58]:
Precisionvader = precision_score(df['Score'], df['predict_Vader'])
Recallvader = recall_score(df['Score'], df['predict_Vader'])
F1vader = f1_score(df['Score'], df['predict_Vader'])

In [59]:
print('Precision Vader = ', round(Precisionvader,3) )
print('Recall Vader = ', Recallvader )
print('F1 Vader = ', round(F1vader,3) )

Precision Vader =  0.59
Recall Vader =  0.733
F1 Vader =  0.654


In [60]:
print(classification_report(df['Score'], df['predict_Vader']))

              precision    recall  f1-score   support

           0       0.65      0.49      0.56      1000
           1       0.59      0.73      0.65      1000

    accuracy                           0.61      2000
   macro avg       0.62      0.61      0.61      2000
weighted avg       0.62      0.61      0.61      2000



## Q2 - Ensemble

In [61]:
df.head()

Unnamed: 0,Review,poscnt_BL,negcnt_BL,netcnt_BL,Score,predictBL,poscnt_LM,negcnt_LM,netcnt_LM,predictLM,score_TextBlob,predict_textblob,negscore_Vader,neuscore_Vader,posscore_Vader,compound_Vader,predict_Vader
0,"1,""films adapted from comic books have had ple...",16,36,-20,1,0,5,11,-6,0,-0.061036,0,0.138,0.802,0.06,-0.9905,0
1,"2,""every now and then a movie comes along from...",21,33,-12,1,0,7,12,-5,0,0.08839,1,0.069,0.833,0.098,0.8319,1
2,"3,""you've got mail works alot better than it d...",32,28,4,1,1,12,7,5,1,0.081941,1,0.075,0.765,0.16,0.9887,1
3,"4,"" jaws is a rare film that grabs your att...",20,41,-21,1,0,5,14,-9,0,0.066679,1,0.085,0.806,0.109,0.9373,1
4,"5,""moviemaking is a lot like being the general...",17,24,-7,1,0,1,6,-5,0,0.054987,1,0.037,0.849,0.114,0.9819,1


In [62]:
dfensemble = df.drop(['Score','predictBL','poscnt_BL','negcnt_BL','netcnt_BL','poscnt_LM','negcnt_LM','netcnt_LM','score_TextBlob','negscore_Vader','neuscore_Vader','posscore_Vader','compound_Vader'], axis=1)

In [63]:
dfensemble.head()

Unnamed: 0,Review,predictLM,predict_textblob,predict_Vader
0,"1,""films adapted from comic books have had ple...",0,0,0
1,"2,""every now and then a movie comes along from...",0,1,1
2,"3,""you've got mail works alot better than it d...",1,1,1
3,"4,"" jaws is a rare film that grabs your att...",0,1,1
4,"5,""moviemaking is a lot like being the general...",0,1,1


In [64]:
dfensemble['majority'] = dfensemble.mode(axis=1)[0]
print(dfensemble)

                                                 Review  predictLM  \
0     1,"films adapted from comic books have had ple...          0   
1     2,"every now and then a movie comes along from...          0   
2     3,"you've got mail works alot better than it d...          1   
3     4,"  jaws   is a rare film that grabs your att...          0   
4     5,"moviemaking is a lot like being the general...          0   
...                                                 ...        ...   
1995  1996,"if anything ,   stigmata   should be tak...          0   
1996  1997,"john boorman's   zardoz   is a goofy cin...          0   
1997  1998,"the kids in the hall are an acquired tas...          0   
1998  1999,"there was a time when john carpenter was...          0   
1999  2000,"two party guys bob their heads to haddaw...          0   

      predict_textblob  predict_Vader  majority  
0                    0              0         0  
1                    1              1         1  
2        

In [65]:
dfnew = pd.read_csv("MovieReview-Sample.csv",header = None)

In [66]:
dfensemble['Score'] = dfnew[2].astype(int)

In [67]:
dfensemble.head()

Unnamed: 0,Review,predictLM,predict_textblob,predict_Vader,majority,Score
0,"1,""films adapted from comic books have had ple...",0,0,0,0,1
1,"2,""every now and then a movie comes along from...",0,1,1,1,1
2,"3,""you've got mail works alot better than it d...",1,1,1,1,1
3,"4,"" jaws is a rare film that grabs your att...",0,1,1,1,1
4,"5,""moviemaking is a lot like being the general...",0,1,1,1,1


In [68]:
confusion_matrix_ensemble = pd.crosstab(dfensemble['Score'], dfensemble['majority'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix_ensemble)

Predicted    0    1
Actual             
0          532  468
1          268  732


In [69]:
Precision_e = precision_score(dfensemble['Score'], dfensemble['majority'])
Recall_e = recall_score(dfensemble['Score'], dfensemble['majority'])
F1_e = f1_score(dfensemble['Score'], dfensemble['majority'])

In [70]:
print('Precision Majority = ', round(Precision_e,3) )
print('Recall Majority = ', Recall_e )
print('F1 Majority = ', round(F1_e,3) )

Precision Majority =  0.61
Recall Majority =  0.732
F1 Majority =  0.665


In [72]:
print(classification_report(dfensemble['Score'], dfensemble['majority']))

              precision    recall  f1-score   support

           0       0.67      0.53      0.59      1000
           1       0.61      0.73      0.67      1000

    accuracy                           0.63      2000
   macro avg       0.64      0.63      0.63      2000
weighted avg       0.64      0.63      0.63      2000

