In [1]:
import pandas as pd
import nltk
import re

# `sklearn.feature_extraction.stop_words` was a private module that newer
# scikit-learn releases removed; the public location is `.text`.  Fall back to
# the old path only for very old installations.
try:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
except ImportError:
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import movie_reviews as reviews
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.text import Text

# Shared instances reused by the cells below: the VADER scorer used to rate
# each review and the WordNet lemmatizer used during text pre-processing.
sid = SentimentIntensityAnalyzer()
word_lm = WordNetLemmatizer()



In [2]:
# Build one record per corpus file: (raw text, neg, neu, pos, compound),
# where the four numbers are the scores returned by sid.polarity_scores.
movie_reviews = []
for fileid in reviews.fileids():
    # Read the file once and reuse the text for both scoring and storage.
    raw_text = reviews.raw(fileid)
    ss = sid.polarity_scores(raw_text)
    movie_reviews.append((raw_text, ss['neg'], ss['neu'], ss['pos'], ss['compound']))

In [3]:
# Tabulate the scored reviews: the review text plus its four polarity scores.
df = pd.DataFrame(
    data=movie_reviews,
    columns=['text', 'neg', 'neu', 'pos', 'compound'],
)
df.head()

Unnamed: 0,text,neg,neu,pos,compound
0,"plot : two teen couples go to a church party ,...",0.093,0.762,0.145,0.9924
1,the happy bastard's quick movie review \ndamn ...,0.041,0.866,0.093,0.916
2,it is movies like these that make a jaded movi...,0.098,0.789,0.113,0.705
3,""" quest for camelot "" is warner bros . ' firs...",0.152,0.713,0.136,-0.8481
4,synopsis : a mentally unstable man undergoing ...,0.077,0.844,0.079,0.6824


In [4]:
# Five reviews with the largest 'neg' score.
df.sort_values(by='neg', ascending=False).head()

Unnamed: 0,text,neg,neu,pos,compound
955,words i thought i'd never write : the sequel t...,0.255,0.674,0.071,-0.9981
64,"rated : r for strong violence , language , dru...",0.243,0.64,0.117,-0.9989
404,can a horror movie truly be called a horror mo...,0.24,0.717,0.044,-0.9987
1651,it might surprise some to know that joel and e...,0.237,0.689,0.074,-0.9993
1747,"edward zwick's "" the siege "" raises more quest...",0.228,0.667,0.105,-0.9954


In [5]:
# Five reviews with the largest 'pos' score.
df.sort_values(by='pos', ascending=False).head()

Unnamed: 0,text,neg,neu,pos,compound
1144,the dramatic comedy cousins has all the necess...,0.03,0.64,0.33,0.9993
1542,don't let the following quirks of this review ...,0.046,0.629,0.326,0.9983
1517,the happy bastard's 30-second review \nnotting...,0.057,0.637,0.306,0.9966
1751,life is beautiful is a rare treat : a lighthea...,0.105,0.597,0.298,0.9989
1752,you've probably heard the one about the priest...,0.059,0.65,0.291,0.9996


In [6]:
# Five reviews with the largest 'neu' score.
df.sort_values(by='neu', ascending=False).head()

Unnamed: 0,text,neg,neu,pos,compound
305,walt disney studios may have finally met its m...,0.054,0.918,0.027,-0.7522
177,deserves recognition for : achieving the near-...,0.015,0.915,0.069,0.7391
1881,the word 'rest' in the title should be stresse...,0.024,0.913,0.063,0.897
1255,it is often said by his fans that hal hartley ...,0.041,0.898,0.061,0.8659
446,here is a movie that sadly follows the hong ko...,0.064,0.898,0.038,-0.8995


In [7]:
# Five reviews with the largest 'compound' score.
df.sort_values(by='compound', ascending=False).head()

Unnamed: 0,text,neg,neu,pos,compound
1780,note : some may consider portions of the follo...,0.038,0.676,0.286,0.9999
1509,as i write the review for the new hanks/ryan r...,0.044,0.668,0.288,0.9999
1728,playwright tom stoppard and screenwriter marc ...,0.066,0.665,0.27,0.9998
1908,i actually am a fan of the original 1961 or so...,0.054,0.733,0.213,0.9998
1163,note : some may consider portions of the follo...,0.048,0.749,0.203,0.9998


In [8]:
def pre_processing(data_frame, txt_clm):
    """Clean a text column and return it alongside its cleaned version.

    The cleaning steps are applied in this order:

    1. lowercase the text;
    2. delete every character that is neither a word character nor whitespace;
    3. delete every run of non-whitespace characters that begins with ``http``;
    4. drop whitespace-separated tokens found in ``ENGLISH_STOP_WORDS``;
    5. lemmatize each remaining token with ``word_lm``;
    6. re-tokenize with ``word_tokenize`` and keep tokens longer than three
       characters.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the text column.  It is not modified.
    txt_clm : str
        Name of the column in ``data_frame`` that holds the text.

    Returns
    -------
    pandas.DataFrame
        A new frame with two columns: the original ``txt_clm`` and
        ``raw_text``, the surviving tokens joined by single spaces.
    """
    # Explicit `re.sub` with raw patterns: `Series.str.replace` only treats its
    # pattern as a regex by default on older pandas, so relying on that default
    # silently leaves punctuation in place on newer releases.
    punctuation_re = re.compile(r'[^\w\s]')
    link_re = re.compile(r'http\S+')

    def _clean(lowered):
        # "_" is a word character for \w, so underscores are kept by this step.
        stripped = link_re.sub('', punctuation_re.sub('', lowered))
        kept = [tok for tok in stripped.split() if tok not in ENGLISH_STOP_WORDS]
        lemmatized = " ".join(word_lm.lemmatize(tok) for tok in kept)
        return " ".join(tok for tok in word_tokenize(lemmatized) if len(tok) > 3)

    # Work on a copy restricted to the text column so the caller's frame is
    # untouched and no intermediate columns are materialised.
    result = data_frame[[txt_clm]].copy()
    result['raw_text'] = data_frame[txt_clm].str.lower().apply(_clean)
    return result[[txt_clm, 'raw_text']]

In [9]:
# Collect every token of the pre-processed reviews that the NLTK tagger labels
# 'NNP' (the Penn Treebank tag for a singular proper noun).
# NOTE(review): pre_processing lowercases the text before tagging, which may
# explain non-name tokens showing up as 'NNP' -- consider tagging the
# original-case text instead.
df_sub = pre_processing(df, 'text')
sentences = df_sub['raw_text']

# Tag all reviews in one call: pos_tag_sents sets the tagger up once, whereas
# calling pos_tag per review repeats that setup for every review.
tokenized_sentences = [sentence.split() for sentence in sentences]
nnp_list = [
    word
    for tagged_sent in nltk.pos_tag_sents(tokenized_sentences)
    for word, pos in tagged_sent
    # No emptiness check needed: str.split() never yields empty tokens.
    if pos == 'NNP'
]

In [10]:
# Frequency table of the collected 'NNP' tokens; show the 20 most common as
# "<token> <count>" lines.
fd = nltk.FreqDist(nnp_list)
for word, count in fd.most_common(20):
    # Function-call form with a single formatted argument prints the same text
    # under both Python 2 and Python 3.
    print(u'{} {}'.format(word, count))

keaton 29
_the 22
october 17
killer 15
kevin 14
kill 13
zane 13
milton 8
trek 8
katrina 7
schreiber 7
know 7
knock 7
_not_ 6
flubber 6
kelley 6
xavier 6
karen 6
_is_ 6
molina 6


In [11]:
# Join all original (un-processed) reviews with newlines, tokenize the result
# and wrap the tokens in an nltk.Text for concordance lookups.
text = nltk.Text(
    nltk.word_tokenize("\n".join(df['text'].tolist()))
)

In [12]:
# Show the concordance for each of the 20 most frequent 'NNP' tokens.
for word, _count in fd.most_common(20):
    # Text.concordance prints its own output and returns None, so it must not
    # be wrapped in print (that would emit a stray "None" after each listing).
    text.concordance(word)

Displaying 25 of 25 matches:
nnell stars in this remake of buster keaton 's 1925 silent film seven chances . 
as i 'm more of a chaplin fan than a keaton fan , but i seriously doubt that the
le dialogue to say . i think michael keaton is the quintessential batman but val
nd bring back catwoman . and michael keaton . but we can only hope . writers : d
ent , with the solid team of michael keaton and andy garcia unthankfully thrown 
ent sociopath peter mccabe ( michael keaton ) , currently serving a life sentenc
enes where the good guys catch up to keaton only to have him grab a hostage and 
rtment . cast against type , michael keaton 's understated menace is highly effe
essful fabric designer ellie ( diane keaton ) for twenty-five years . they and b
oes n't initiate his own sex romps ? keaton 's ellie has a trusting nature that 
` unsung heroes ' . meg ryan , diane keaton and lisa kudrow play a trio of siste
ursue a impending career . georgia ( keaton ) , the eldest daughter , is celebra