## Reading Data

In [9]:
import re
import pandas as pd
 
df = pd.read_csv("../data/imdb_master.csv", encoding='latin-1', index_col = 0)

df = df.drop(['type','file'],axis=1)
df.columns = ["review","sentiment"]
df.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,neg
1,This is an example of why the majority of acti...,neg
2,"First of all I hate those moronic rappers, who...",neg
3,Not even the Beatles could write songs everyon...,neg
4,Brass pictures (movies is not a fitting word f...,neg


In [10]:
df = df[df.sentiment != 'unsup']
df['sentiment'] = df['sentiment'].map({'pos': 1, 'neg': 0})
df.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [11]:
def clean_review(text):
    # Strip HTML tags
    text = re.sub('<[^<]+?>', ' ', text)
 
    # Strip escaped quotes
    text = text.replace('\\"', '')
 
    # Strip quotes
    text = text.replace('"', '')
 
    return text

df.review = df.review.apply(lambda x: clean_review(x))

## Preparing text

In [12]:
%%time
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df['cleaned_review_nltk'] = df.review.apply(lambda x: clean_text(x))

# Check out how the cleaned review compares to the original one
print(df['review'][10])
print("\n\n")
print(df['cleaned_review_nltk'][10])

Cage plays a drunk and gets high critically praise. Elizabeth Shue Actually has to do a love seen with the most unattractive and overrated piece of dung flesh in Hollywood. I literally vomited while watching this film. Of course I had the flu, but that does not mean this film did not contribute to the vomit in the kamode.   Why can't Nick Cage play something he can really pull off like a bad actor. Nick Cage who be brilliant in a role as a bad actor. Heck nobody could do it better.  The search begins for Nick's contract with Lucifer or was it Lou Cipher from Night Train To Terror.



cage play drink get high critically praise elizabeth shue actually ha love see unattractive overrate piece dung flesh hollywood literally vomit watch film course flu doe mean film contribute vomit kamode   cant nick cage play something really pull like bad actor nick cage brilliant role bad actor heck nobody could better  search begin nick contract lucifer wa lou cipher night train terror
CPU times: user 4

In [13]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review_nltk
0,Once again Mr. Costner has dragged out a movie...,0,mr costner ha drag movie far longer necessary ...
1,This is an example of why the majority of acti...,0,example majority action film generic bore real...
2,"First of all I hate those moronic rappers, who...",0,first hate moronic rapper couldnt act gun pres...
3,Not even the Beatles could write songs everyon...,0,even beatles could write song everyone like al...
4,Brass pictures (movies is not a fitting word f...,0,brass picture movie fit word really somewhat b...


In [14]:
%%time
import spacy

nlp = spacy.blank('en')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

def clean_text_spacy(text):
    doc = nlp(text)
    tokens = [token for token in doc if token.is_punct == False]
    tokens = [token for token in tokens if token.text not in spacy_stopwords]
    tokens = [token.lemma_.lower() for token in tokens]
    return " ".join(tokens)
    return tokens

df['cleaned_review_spacy'] = df.review.apply(lambda x: clean_text_spacy(x))

CPU times: user 50.7 s, sys: 128 ms, total: 50.9 s
Wall time: 50.9 s


In [15]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review_nltk,cleaned_review_spacy
0,Once again Mr. Costner has dragged out a movie...,0,mr costner ha drag movie far longer necessary ...,once mr. costner drag movie far long necessary...
1,This is an example of why the majority of acti...,0,example majority action film generic bore real...,this example majority action film generic bore...
2,"First of all I hate those moronic rappers, who...",0,first hate moronic rapper couldnt act gun pres...,first i hate moronic rappers could'nt act gun ...
3,Not even the Beatles could write songs everyon...,0,even beatles could write song everyone like al...,not beatles write song like walter hill mop 2 ...
4,Brass pictures (movies is not a fitting word f...,0,brass picture movie fit word really somewhat b...,brass picture movie fit word somewhat brassy t...


## Bag of Words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
 
# Shuffle the data and then split it, keeping 20% aside for testing
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review_nltk'], df['sentiment'], test_size=0.2)
 
# we prepare the vectorizer
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
%%time
# get a simple baseline with Logistic Regression
classifier = LogisticRegression()
classifier.fit(vectorizer.transform(X_train), y_train)
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test)) 



Score: 0.8827
CPU times: user 7.62 s, sys: 28 ms, total: 7.65 s
Wall time: 7.65 s


## TFIDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# we prepare the vectorizer
vectorizer = TfidfVectorizer(lowercase=True)
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
%%time
# get a simple baseline with Logistic Regression
classifier = LogisticRegression()
classifier.fit(vectorizer.transform(X_train), y_train)
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test)) 

Score: 0.8964
CPU times: user 3.78 s, sys: 18 µs, total: 3.78 s
Wall time: 3.78 s


## Bag of Words with Spacy cleanup

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
 
# Shuffle the data and then split it, keeping 20% aside for testing
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review_spacy'], df['sentiment'], test_size=0.2)
 
# we prepare the vectorizer
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
%%time
# get a simple baseline with Logistic Regression
classifier = LogisticRegression()
classifier.fit(vectorizer.transform(X_train), y_train)
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test)) 

Score: 0.8833
CPU times: user 7.64 s, sys: 0 ns, total: 7.64 s
Wall time: 7.64 s


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# we prepare the vectorizer
vectorizer = TfidfVectorizer(lowercase=True)
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [23]:
%%time
# get a simple baseline with Logistic Regression
classifier = LogisticRegression()
classifier.fit(vectorizer.transform(X_train), y_train)
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test)) 

Score: 0.893
CPU times: user 3.36 s, sys: 0 ns, total: 3.36 s
Wall time: 3.36 s


## Deep Learning

In [24]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['cleaned_review_nltk'])
list_tokenized_train = tokenizer.texts_to_sequences(df['cleaned_review_nltk'])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = df['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Using TensorFlow backend.


Train on 40000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb2ed332160>