In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
#Load data
path = "aclImdb/"
positiveFiles = [x for x in os.listdir(path+"train/pos/") if x.endswith(".txt")]
negativeFiles = [x for x in os.listdir(path+"train/neg/") if x.endswith(".txt")]
positive2Files = [x for x in os.listdir(path+"test/pos/") if x.endswith(".txt")]
negative2Files = [x for x in os.listdir(path+"test/neg/") if x.endswith(".txt")]

In [3]:
positiveReviews, negativeReviews, positive2Reviews, negative2Reviews = [], [], [], []
for pfile in positiveFiles:
    with open(path+"train/pos/"+pfile, encoding="latin1") as f:
        positiveReviews.append(f.read())
for nfile in negativeFiles:
    with open(path+"train/neg/"+nfile, encoding="latin1") as f:
        negativeReviews.append(f.read())
for p2file in positive2Files:
    with open(path+"test/pos/"+p2file, encoding="latin1") as f:
        positive2Reviews.append(f.read())
for n2file in negative2Files:
    with open(path+"test/neg/"+n2file, encoding="latin1") as f:
        negative2Reviews.append(f.read())

In [4]:
reviews = pd.concat([
    pd.DataFrame({"review":positiveReviews, "label":1, "file":positiveFiles}),
    pd.DataFrame({"review":negativeReviews, "label":0, "file":negativeFiles}),
    pd.DataFrame({"review":positive2Reviews, "label":1, "file":positive2Files}),
    pd.DataFrame({"review":negative2Reviews, "label":0, "file":negative2Files})
], ignore_index=True).sample(frac=1, random_state=1)
reviews.head()

Unnamed: 0,review,label,file
26247,Fame is one of the best movies I've seen about...,1,11122_8.txt
35067,This movie fully deserves to be one of the top...,1,7811_10.txt
34590,"in a time of predictable movies, in which abou...",1,7382_10.txt
16668,I saw this on TV the other nightÂ or rather I...,0,2501_1.txt
12196,I am a huge fan of Simon Pegg and have watched...,1,9728_7.txt


In [5]:
reviews["label"] = pd.to_numeric(reviews["label"])

reviews["review"] = reviews["review"].str.replace(",","")
reviews.head()

Unnamed: 0,review,label,file
26247,Fame is one of the best movies I've seen about...,1,11122_8.txt
35067,This movie fully deserves to be one of the top...,1,7811_10.txt
34590,in a time of predictable movies in which aboun...,1,7382_10.txt
16668,I saw this on TV the other nightÂ or rather I...,0,2501_1.txt
12196,I am a huge fan of Simon Pegg and have watched...,1,9728_7.txt


In [6]:
reviews.to_csv('reviews.csv')

In [2]:
reviews_df = pd.read_csv('reviews.csv')
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,review,label,file
0,26247,Fame is one of the best movies I've seen about...,1,11122_8.txt
1,35067,This movie fully deserves to be one of the top...,1,7811_10.txt
2,34590,in a time of predictable movies in which aboun...,1,7382_10.txt
3,16668,I saw this on TV the other nightÂ or rather I...,0,2501_1.txt
4,12196,I am a huge fan of Simon Pegg and have watched...,1,9728_7.txt


In [3]:
reviews_list = reviews_df["review"].tolist()

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

compound_list = []

for review in reviews_list:
    results = analyzer.polarity_scores(review)
    compound = results["compound"]
    compound_list.append(compound)

In [5]:
vader_df = pd.DataFrame({'vader':compound_list})
vader_df.head()

Unnamed: 0,vader
0,0.9618
1,0.9921
2,0.9342
3,-0.9759
4,0.9702


In [6]:
vader_df2 = pd.merge(reviews_df, vader_df, how='outer', left_index=True, right_index=True)
vader_df2.head(3)

Unnamed: 0.1,Unnamed: 0,review,label,file,vader
0,26247,Fame is one of the best movies I've seen about...,1,11122_8.txt,0.9618
1,35067,This movie fully deserves to be one of the top...,1,7811_10.txt,0.9921
2,34590,in a time of predictable movies in which aboun...,1,7382_10.txt,0.9342


In [12]:
vader_df2.to_csv('vader.csv')

In [9]:
vader_list = vader_df["vader"].tolist()

x = 1
y = 0

new_items = [1 if x >=0 else y for x in vader_list]

In [42]:
vader_df3 = pd.DataFrame({'vader_pos_neg':new_items})

vader_df4 = pd.merge(vader_df2, vader_df3, how='outer', left_index=True, right_index=True)
vader_df4

Unnamed: 0.1,Unnamed: 0,review,label,file,vader,vader_pos_neg
0,26247,Fame is one of the best movies I've seen about...,1,11122_8.txt,0.9618,1
1,35067,This movie fully deserves to be one of the top...,1,7811_10.txt,0.9921,1
2,34590,in a time of predictable movies in which aboun...,1,7382_10.txt,0.9342,1
3,16668,I saw this on TV the other nightÂ or rather I...,0,2501_1.txt,-0.9759,0
4,12196,I am a huge fan of Simon Pegg and have watched...,1,9728_7.txt,0.9702,1
5,2600,There is indeed much to complain about this mo...,1,12340_8.txt,0.9957,1
6,9047,The men can slaver over Lollo if they like (or...,1,6894_7.txt,0.9488,1
7,2206,Since it has been some years since I reviewed ...,1,11987_7.txt,0.9964,1
8,25607,I love this film it is excellent and so funny ...,1,10547_10.txt,0.9827,1
9,11606,This is actually a groovy-neat little flick ma...,1,9197_7.txt,0.7052,1


In [16]:
vader_df4.to_csv('vader.csv')

In [36]:
vader_correct = []
for x in vader_df4.itertuples():
    if (x.label == x.vader_pos_neg):
        vader_correct.append((x.file))
    else:
        continue 

In [41]:
count = len(vader_correct)
count

34708