In [1]:
import nltk
from nltk import FreqDist
#nltk.download('stopwords') # run this one time

In [2]:
import pandas as pd
pd.set_option("display.max_colwidth", 200)
import numpy as np
import re
import spacy

import gensim
from gensim import corpora


unable to import 'smart_open.gcs', disabling that module


In [3]:
df = pd.read_csv('/Users/chenzichu/Desktop/NLP/NLP project/data/RT_cleaned.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,0,3,"A distinctly gallows take on contemporary financial mores, as one absurdly rich man's limo ride across town for a haircut functions as a state-of-the-nation discourse.",3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,7,3,Cronenberg is not a director to be daunted by a scenario in which the antihero spends most of his time in a stretch limo. Turning it into a film that interests anyone ... is another matter,2/5,rotten,Matt Kelemen,0,Las Vegas CityLife,"April 21, 2013"
2,15,3,"For better or worse - often both - Cosmopolis is a quintessential David Cronenberg film. Cosmopolis is simultaneously fascinating and impenetrable, profound and absurd, labyrinthine yet intimate.",3/5,fresh,Adam Ross,0,The Aristocrat,"September 27, 2012"
3,16,3,"For one of the smartest films I've seen in a while, Cosmopolis is also one of the least outwardly enjoyable. That by no means makes it anything less than a great film however.",4/5,fresh,Patrick Kolan,0,Shotgun Cinema,"September 26, 2012"
4,23,3,Those who said Don DeLillo's book was unfilmable were wrong. This is a film. That much is undeniable. Whether it's a compelling one is a whole other question.,2/5,rotten,Mike Scott,0,Times-Picayune,"September 7, 2012"


# Data Cleaning

In [5]:
# remove unwanted characters, numbers and symbols
df['review'] = df['review'].str.replace("[^a-zA-Z#]", " ")

In [9]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words += ['movie', 'film', 'good', 'great', 'review', 'just', 'like', 'enjoy', 'best', 
                   'wa', 'hi', 'ha', 'movi']

In [10]:
# function to remove stopwords
def remove_stopwords(rev):
    rev_new = " ".join([i for i in rev if i not in stop_words])
    return rev_new

In [11]:
# remove short words (length < 3)
df['review'] = df['review'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [12]:
# remove stopwords from the text
reviews = [remove_stopwords(r.split()) for r in df['review']]

In [13]:
# make entire text lowercase
reviews = [r.lower() for r in reviews]

In [15]:
!python -m spacy download en # one time run

/Users/chenzichu/anaconda3/anaconda3/anaconda3/bin/python: No module named spacy


In [14]:
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts, tags=['NOUN', 'ADJ']): # filter noun and adjective
       output = []
       for sent in texts:
             doc = nlp(" ".join(sent)) 
             output.append([token.lemma_ for token in doc if token.pos_ in tags])
       return output

In [15]:
tokenized_reviews = pd.Series(reviews).apply(lambda x: x.split())
print(tokenized_reviews[1])

['cronenberg', 'director', 'daunted', 'scenario', 'antihero', 'spends', 'time', 'stretch', 'limo', 'turning', 'interests', 'anyone', 'another', 'matter']


In [16]:
reviews_2 = lemmatization(tokenized_reviews)
print(reviews_2[1]) # print lemmatized review

['director', 'scenario', 'antihero', 'time', 'stretch', 'limo', 'interest', 'anyone', 'matter']


# Model

In [22]:
dictionary = corpora.Dictionary(reviews_2)

In [23]:
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews_2]

In [26]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=7, random_state=100,
                chunksize=1000, passes=50)

In [27]:
lda_model.print_topics()

[(0,
  '0.011*"classic" + 0.010*"comedy" + 0.010*"-PRON-" + 0.009*"performance" + 0.009*"sweet" + 0.007*"funny" + 0.007*"form" + 0.006*"music" + 0.006*"year" + 0.006*"top"'),
 (1,
  '0.011*"big" + 0.011*"family" + 0.010*"funny" + 0.010*"comedy" + 0.008*"story" + 0.008*"lot" + 0.007*"drama" + 0.007*"child" + 0.007*"screen" + 0.006*"part"'),
 (2,
  '0.015*"old" + 0.012*"character" + 0.010*"year" + 0.010*"plot" + 0.007*"game" + 0.006*"much" + 0.006*"comedy" + 0.006*"new" + 0.006*"story" + 0.006*"level"'),
 (3,
  '0.012*"original" + 0.011*"fan" + 0.009*"sci" + 0.008*"sequel" + 0.007*"enough" + 0.007*"effect" + 0.007*"solid" + 0.007*"special" + 0.006*"star" + 0.006*"time"'),
 (4,
  '0.017*"bad" + 0.012*"action" + 0.011*"fun" + 0.010*"movie" + 0.009*"thriller" + 0.009*"bond" + 0.008*"good" + 0.007*"book" + 0.007*"summer" + 0.006*"time"'),
 (5,
  '0.012*"love" + 0.012*"performance" + 0.010*"story" + 0.009*"heart" + 0.008*"right" + 0.008*"drama" + 0.006*"film" + 0.006*"man" + 0.006*"war" + 0.0

# Deal with a paragraph

In [29]:
a_review = 'This film works so well because it takes place in an underworld in which we are so embedded that we do not even observe it. Coppola puts us straight in the smack-dab center of what is, admittedly, a society made by criminals for criminals. It is also the reason why it is so welcoming. We are surrounded by its inhabitants--cold-blooded murderers, men who see crime like a 9 to 5 job masquerading as honorable men. And I do mean men. From the outside, we would only witness the horrifying, disturbing manifestations of their well-thought out actions.But it goes even deeper than that. It all revolves around the Corleone family led by Don Vito Corleone (Marlon Brando). He is the most honest of these men, sitting right on the edge. But for people like him, who do not fully embrace this world, it is not easy. He avoids conflict until it is absolutely necessary. He is a man defined by moral principles. There is a scene at the beginning, in which, during his daughters wedding day, one of his associates, Luca Brasi (Lenny Montana) practices his speech that he is going to give to the Don when he meets him. The scene with these two is funny and almost adorable. I could not help but sympathize both of them only to realize that I am feeling warmth for two mobsters. Not to even mention that Lenny Montana was an actual mob hit-man and that he was actually nervous as he said that line.'

In [30]:
# split the paragraph into sentences
sentences = nltk.tokenize.sent_tokenize(a_review)

## Data cleaning

In [42]:
# data cleaning
df = pd.DataFrame(sentences, columns = ['review'])
# remove unwanted characters, numbers and symbols
df['review'].str.replace("[^a-zA-Z#]", " ")
# remove short words (length < 3)
df['review'] = df['review'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
# remove stopwords from the text
reviews = [remove_stopwords(r.split()) for r in df['review']]
# make entire text lowercase
reviews = [r.lower() for r in reviews]
# tokenize and filter noun and adjective
tokenized_reviews = pd.Series(reviews).apply(lambda x: x.split())
reviews_2 = lemmatization(tokenized_reviews)

## Model

In [43]:
dictionary = corpora.Dictionary(reviews_2)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews_2]

In [44]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=7, random_state=100,
                chunksize=1000, passes=50)

In [51]:
lda_model

LdaModel(num_terms=57, num_topics=7, decay=0.5, chunksize=1000)
