# **Movie Reception Prediction with LDA**
# -------------------------------------------------------
# Part 1: Preprocessing

### Reading in Reviews Data

In [18]:
%config IPCompleter.greedy=True

In [19]:
import pandas as pd

In [20]:
pd.set_option('max_colwidth',200)

In [21]:
import pickle

In [22]:
#IMDB Reviews DF
reviews_df = pd.read_csv('IMDB_Reviews.csv')
reviews_df.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue i...",positive
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenl...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what mone...",positive


In [23]:
#Save Reviews DF
reviews_df.to_pickle('pickled/reviews_df.pkl')

In [24]:
reviews_df.tail()

Unnamed: 0,review,sentiment
49995,"I thought this movie did a down right good job. It wasn't as creative or original as the first, but who was expecting it to be. It was a whole lotta fun. the more i think about it the more i like ...",positive
49996,"Bad plot, bad dialogue, bad acting, idiotic directing, the annoying porn groove soundtrack that ran continually over the overacted script, and a crappy copy of the VHS cannot be redeemed by consum...",negative
49997,"I am a Catholic taught in parochial elementary schools by nuns, taught by Jesuit priests in high school & college. I am still a practicing Catholic but would not be considered a ""good Catholic"" in...",negative
49998,"I'm going to have to disagree with the previous comment and side with Maltin on this one. This is a second rate, excessively vicious Western that creaks and groans trying to put across its central...",negative
49999,"No one expects the Star Trek movies to be high art, but the fans do expect a movie that is as good as some of the best episodes. Unfortunately, this movie had a muddled, implausible plot that just...",negative


In [25]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [26]:
review_texts = pd.DataFrame(reviews_df['review'])
review_texts.head()

Unnamed: 0,review
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me..."
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p..."
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue i..."
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenl...
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what mone..."


In [27]:
review_texts.to_pickle('pickled/review_texts.pkl')

### Preprocessing

In [28]:
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [29]:
#Stop Words
stop_words = list(stopwords.words('english'))
stop_words.append('film')
stop_words.append('movie')
stop_words.append('episode')
stop_words = set(stop_words)

In [30]:
#Stemmer
stemmer = SnowballStemmer("english")

In [31]:
#Lemmatizer
lemmatizer = WordNetLemmatizer()

In [32]:
#Preprocess Functions
def tagText(text):
    return nltk.pos_tag(text)

def getPOS(token_tag):
    if token_tag.startswith('J'):
        return wordnet.ADJ
    elif token_tag.startswith('V'):
        return wordnet.VERB
    elif token_tag.startswith('N'):
        return wordnet.NOUN
    elif token_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
    

def lemAndStem(token, POS):
    POS = getPOS(POS)
    if POS:
         return stemmer.stem(lemmatizer.lemmatize(token, pos=POS))
    else:
        return stemmer.stem(lemmatizer.lemmatize(token))
        
        
       
   

def preprocess(review):
    processed_review = []
    
    for token in simple_preprocess(review):
        if token not in stop_words and len(token)> 3:
            processed_review.append(token)
    
    tagged_review = tagText(processed_review)

    ready_review = []
    for word_set in tagged_review:
        ready_review.append(lemAndStem(word_set[0], word_set[1]))

    
   
    return ready_review
    
   

In [33]:
#Test Preprocessing: Sample Before
sample_text=review_texts.values[0][0]
sample_text


"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [34]:
sample_result = preprocess(sample_text)
print(sample_result)
print(len(sample_result))

['review', 'mention', 'watch', 'hook', 'right', 'exact', 'happen', 'first', 'thing', 'struck', 'brutal', 'unflinch', 'scene', 'violenc', 'right', 'word', 'trust', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'violenc', 'hardcor', 'classic', 'word', 'call', 'nicknam', 'give', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'main', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'citi', 'home', 'mani', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'away', 'would', 'main', 'appeal', 'show', 'fact', 'go', 'show', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanc', 'mess', 'around', 'first', 'ever', 'strike', 'nasti', 'surreal', 'readi', 'watch', 'develop', 'tast', 'accustom', 'high', 'level', 'graphic', 'v

In [35]:
#Preprocess Reviews
processed_reviews = review_texts['review'].map(preprocess)
processed_reviews[:10]

0    [review, mention, watch, hook, right, exact, happen, first, thing, struck, brutal, unflinch, scene, violenc, right, word, trust, show, faint, heart, timid, show, pull, punch, regard, drug, violenc...
1    [wonder, littl, product, film, techniqu, unassum, time, fashion, give, comfort, sometim, discomfort, sens, realism, entir, piec, actor, extrem, well, choos, michael, sheen, polari, voic, truli, se...
2    [think, wonder, spend, time, summer, weekend, sit, condit, theater, watch, light, heart, comedi, plot, simplist, dialogu, witti, charact, likabl, even, well, bread, suspect, serial, killer, disapp...
3    [basic, famili, littl, jake, think, zombi, closet, parent, fight, time, slow, soap, opera, sudden, jake, decid, becom, rambo, kill, zombi, first, go, make, must, decid, thriller, drama, drama, wat...
4    [petter, mattei, love, time, money, visual, stun, watch, mattei, offer, vivid, portrait, human, relat, seem, tell, money, power, success, peopl, differ, situat, encount, varia

In [36]:
processed_reviews.to_pickle('pickled/processed_reviews.pkl')