In [25]:
import nltk
from nltk.corpus import movie_reviews
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [3]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [4]:
reviews = []
for fileid in movie_reviews.fileids():
    tag, filename = fileid.split('/')
    reviews.append((filename, tag, movie_reviews.raw(fileid)))
df = pd.DataFrame(reviews, columns=['filename', 'tag', 'text'])
df.head()

Unnamed: 0,filename,tag,text
0,cv000_29416.txt,neg,"plot : two teen couples go to a church party ,..."
1,cv001_19502.txt,neg,the happy bastard's quick movie review \ndamn ...
2,cv002_17424.txt,neg,it is movies like these that make a jaded movi...
3,cv003_12683.txt,neg,""" quest for camelot "" is warner bros . ' firs..."
4,cv004_12641.txt,neg,synopsis : a mentally unstable man undergoing ...


In [5]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()]) #removing actors names from the data
#can also be customised to remove most frequent words used in positive and egative reviews, for a cleaner distribution of words

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\t', '', text)
    
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in unwanted]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lem_tokens)
    
    return processed_text

In [6]:
df.text[0]

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [7]:
preprocess_text(df.text[0])

'plot two teen couple go church party drink drive get accident one guy dy girlfriend continues life nightmare whats deal watch movie sorta find critique mindfuck movie teen generation touch cool idea present bad package make review even harder one write since generally applaud film attempt break mold mess head lost highway memento good bad way making type film folk didnt snag one correctly seem taken pretty neat concept executed terribly problem movie well main problem simply jumbled start normal downshift fantasy world audience member idea whats going dream character coming back dead others look like dead strange apparition disappearance looooot scene ton weird thing happen simply explained personally dont mind trying unravel film every give clue get kind fed film biggest problem obviously got big secret hide seems want hide completely final five minute make thing entertaining thrilling even engaging meantime really sad part arrow dig flick like actually figured halfway point strangen

In [8]:
df['text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,filename,tag,text
0,cv000_29416.txt,neg,plot two teen couple go church party drink dri...
1,cv001_19502.txt,neg,bastard quick movie review damn yk bug got hea...
2,cv002_17424.txt,neg,movie like make jaded movie viewer thankful in...
3,cv003_12683.txt,neg,quest camelot bros first featurelength fullyan...
4,cv004_12641.txt,neg,synopsis mentally unstable man undergoing psyc...


In [9]:
df.tag.unique()

array(['neg', 'pos'], dtype=object)

In [10]:
def label_encoding(a):
    if a == 'pos':
        return 0
    if a == 'neg':
        return 1

df.tag = df.tag.apply(label_encoding)
df.head()

Unnamed: 0,filename,tag,text
0,cv000_29416.txt,1,plot two teen couple go church party drink dri...
1,cv001_19502.txt,1,bastard quick movie review damn yk bug got hea...
2,cv002_17424.txt,1,movie like make jaded movie viewer thankful in...
3,cv003_12683.txt,1,quest camelot bros first featurelength fullyan...
4,cv004_12641.txt,1,synopsis mentally unstable man undergoing psyc...


In [11]:
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 0 if scores['pos'] >= 0 else 1
    return sentiment
df['predicted_sentiment'] = df.text.apply(get_sentiment)
def get_score(text):
    scores = analyzer.polarity_scores(text)
    score = scores['pos'] if scores['pos'] >= 0 else scores['neg']
    return score
df['sentiment_score'] = df.text.apply(get_score)

In [12]:
df

Unnamed: 0,filename,tag,text,predicted_sentiment,sentiment_score
0,cv000_29416.txt,1,plot two teen couple go church party drink dri...,0,0.208
1,cv001_19502.txt,1,bastard quick movie review damn yk bug got hea...,0,0.132
2,cv002_17424.txt,1,movie like make jaded movie viewer thankful in...,0,0.180
3,cv003_12683.txt,1,quest camelot bros first featurelength fullyan...,0,0.154
4,cv004_12641.txt,1,synopsis mentally unstable man undergoing psyc...,0,0.123
...,...,...,...,...,...
1995,cv995_21821.txt,0,wow movie everything movie funny dramatic inte...,0,0.411
1996,cv996_11592.txt,0,commanding actor he always great film everythi...,0,0.203
1997,cv997_5046.txt,0,glorystarring denzel freemanis true story th r...,0,0.177
1998,cv998_14111.txt,0,spielberg second epic film world war ii unques...,0,0.185


In [56]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df['tag'], df['predicted_sentiment'])
#This is fair, as VADER rates better on tweets, as it's trained based on tweets

array([[1000,    0],
       [1000,    0]], dtype=int64)

In [14]:
positive_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_ids + negative_ids

In [27]:
#Redefining the functions to work on an entire movie review
from statistics import mean
def is_positive(review_id: str) -> bool:
    #True if the average of all sentence compound scores is positive
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [analyzer.polarity_scores(sentence)["compound"] for sentence in nltk.sent_tokenize(text)]
    return mean(scores) > 0

In [28]:
from random import shuffle

shuffle(all_review_ids)
correct = 0

for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in positive_ids: 
            correct += 1
    else:
        if review_id in negative_ids:
            correct += 1
print(correct/len(all_review_ids))
#64% is predicted correctly. 

0.64


In [17]:
#Using Machine Learning Methods

In [36]:
#Extracting features that can be used to train the ML models, positive scores and compound scores, can also be extended to most frequent words used in a pos/neg review.
def extract_features(text):
    features = dict()
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        compound_scores.append(analyzer.polarity_scores(sentence)["compound"])
        positive_scores.append(analyzer.polarity_scores(sentence)["pos"])
        
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)

    return features

In [40]:
features = [(extract_features(t), "pos") for t in df['text'][df['tag']==0].tolist()]
features.extend([(extract_features(t), "neg") for t in df['text'][df['tag']==1].tolist()])

In [50]:
classifier = nltk.NaiveBayesClassifier.train(features)

In [52]:
print(classifier.classify(extract_features("I love the plot!")))

pos


In [43]:
nltk.classify.accuracy(classifier, features)

0.8785

In [46]:
#KNNClassifier
from sklearn.neighbors import KNeighborsClassifier
cls = nltk.classify.SklearnClassifier(KNeighborsClassifier())
cls.train(features)
nltk.classify.accuracy(cls, features)

0.762

In [47]:
#LogisticRegression Classifier
from sklearn.linear_model import LogisticRegression
cls = nltk.classify.SklearnClassifier(LogisticRegression())
cls.train(features)
nltk.classify.accuracy(cls, features)

0.6435