# Bag of Words Meets Bags of Popcorn

[Kaggle Challenge](https://www.kaggle.com/c/word2vec-nlp-tutorial)
Use Google's Word2Vec for movie reviews

Deadline: 2019/01/05

In [26]:
import numpy as np
import pandas as pd
import keras

import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec

import re
import codecs
import matplotlib.pyplot as plt

In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping

## Import Cleaned Data

In [14]:
# Load the three pre-cleaned review sets; the first CSV column is used as the row index.
train = pd.read_csv("Data/Word2Vec_clean_labeledTrainData.csv", index_col=0)
unlabeled_train = pd.read_csv("Data/Word2Vec_clean_unlabeledTrainData.csv", index_col=0)
test = pd.read_csv("Data/Word2Vec_clean_testData.csv", index_col=0)

# Report how many reviews each set contains (train, test, unlabeled — in that order).
review_counts = (
    len(train["review"]),
    len(test["review"]),
    len(unlabeled_train["review"]),
)
print("Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews.\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 25000 unlabeled reviews.



In [32]:
# Preview the first rows of the test set (columns shown: review, id, score).
test.head()

Unnamed: 0,review,id,score
0,naturally in a film who s main themes are of m...,"""12311_10""",0
1,this movie is a disaster within a disaster fil...,"""8348_2""",2
2,all in all this is a movie for kids we saw it ...,"""5828_4""",4
3,afraid of the dark left me with the impression...,"""7186_2""",2
4,a very accurate depiction of small time mob li...,"""12128_7""",7


In [19]:
# Hyperparameters shared by the tokenizer, the padding step and the embedding layer.
EMBEDDING_DIM = 300        # size of each word vector
MAX_VOCAB_SIZE = 200000    # upper bound on unique words kept (i.e. max number of rows in the embedding matrix)
MAX_SEQUENCE_LENGTH = 500  # reviews are padded/truncated to this many tokens

### Vectorization

We vectorize the text corpus by turning each review into a **sequence of integers**. Each integer is the index of a token in the tokenizer's dictionary.

In [23]:
# Pull the raw review strings out once; they are needed both for fitting and for encoding.
train_texts = train['review'].tolist()
test_texts = test['review'].tolist()

# Word-level tokenizer whose dictionary is built from the train and test reviews together.
keras_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, char_level=False)
keras_tokenizer.fit_on_texts(train_texts + test_texts)

word_index = keras_tokenizer.word_index
print('Keras Tokenizer found %s unique tokens' % len(word_index))

# Encode every review as a list of integer token indices.
train_sequences = keras_tokenizer.texts_to_sequences(train_texts)
test_sequences = keras_tokenizer.texts_to_sequences(test_texts)

Keras Tokenizer found 101398 unique tokens


In [24]:
# Bring every review to exactly MAX_SEQUENCE_LENGTH tokens so the batches are rectangular.
# With pad_sequences' defaults, shorter reviews are zero-padded and longer ones truncated.
train_pad_sequences, test_pad_sequences = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences)
)

In [28]:

# Word embedding matrix: row i holds the pre-trained Word2Vec vector of the token
# whose tokenizer index is i.
word2vec_model = Word2Vec.load("features300-minwords40-context10")

# Number of rows: one per tokenizer index (the +1 accounts for index 0, which the
# Keras tokenizer never assigns to a word), capped at MAX_VOCAB_SIZE.
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, index in word_index.items():
    # word_index is not truncated to MAX_VOCAB_SIZE, so an index can fall outside
    # the matrix when the vocabulary is larger than the cap; skip those words
    # instead of raising IndexError.
    if index >= num_words:
        continue
    # Membership test on the KeyedVectors object itself: works on gensim 3.x and
    # on gensim 4.x, where the `wv.vocab` attribute no longer exists.
    if word in word2vec_model.wv:
        embedding_matrix[index] = word2vec_model.wv.get_vector(word)

# Null word embeddings: words that do not exist in the Word2Vec model are left as zero vectors.
# A row is counted as null only when every component is zero (a non-zero vector
# whose components happen to sum to zero is not miscounted).
print('Null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Null word embeddings: 84942


In [30]:
# Split data: shuffle the row positions with a fixed seed, then cut them into a
# training part and a validation part.
VALIDATION_SPLIT = 0.2
np.random.seed(1234)

num_reviews = len(train_sequences)
split_at = int(num_reviews * (1 - VALIDATION_SPLIT))  # first row position of the validation part

perm = np.random.permutation(num_reviews)
index_train = perm[:split_at]
index_val = perm[split_at:]

x_train = train_pad_sequences[index_train]
x_val = train_pad_sequences[index_val]
# `perm` holds row *positions* (it also indexes the padded numpy array above), so
# select the labels positionally with .iloc; plain [] would look the numbers up
# as index *labels* and only matches when the frame's index is exactly 0..n-1.
y_train = train["sentiment"].iloc[index_train].tolist()
y_val = train["sentiment"].iloc[index_val].tolist()

print('Randomly split %d pad sequences for training, %d for validation' % (len(x_train) ,len(x_val)))

Randomly split 20000 pad sequences for training, 5000 for validation


In [33]:
# The padded test sequences are used as the test input as-is (no further split).
x_test = test_pad_sequences

## Network Architecture

In [43]:
# Scratch model with placeholder sizes (100000-word vocabulary, 200-dim vectors,
# 45-token inputs); it is only summarised here, apparently to inspect the layer
# output shapes of the Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense stack.
e = Embedding(100000, 200, input_length=45)

structure_test = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])

structure_test.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 45, 200)           20000000  
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 44, 100)           40100     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               25856     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 20,066,213
Trainable params: 20,066,213
Non-trainable params: 0
_________________________________________________________________


In [37]:
def layer1_cnn(dropout=0.2, num_filters=64, kernel_size=2):
    """Build and compile a one-convolution-layer CNN for binary sentiment classification.

    Layer stack: frozen embedding initialised from the notebook-level
    ``embedding_matrix`` -> Conv1D (ReLU) -> global max pooling ->
    Dense(32, ReLU) -> Dropout -> Dense(1, sigmoid).

    Reads the notebook globals ``num_words``, ``EMBEDDING_DIM``,
    ``embedding_matrix`` and ``MAX_SEQUENCE_LENGTH``.

    Args:
        dropout: Dropout rate applied after the 32-unit dense layer.
        num_filters: Number of convolution filters.
        kernel_size: Width of the convolution window, in tokens.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    # Embedding weights come from the pre-built matrix and are not updated during training.
    frozen_embedding = Embedding(
        input_dim=num_words,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    # Created before the hidden layers so the automatically generated layer names stay the same.
    sigmoid_head = Dense(1, activation='sigmoid')

    classifier = Sequential([
        frozen_embedding,
        Conv1D(filters=num_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1),
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),
        Dropout(dropout),
        sigmoid_head,
    ])
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return classifier

In [40]:
# Build the CNN with its default hyperparameters and print its layer summary.
# NOTE(review): the output saved with this cell is an UnboundLocalError, so the
# model was not built in the recorded run — the cause is not visible in this
# notebook; confirm the Keras/backend environment before relying on this cell.
layer1_cnn_model = layer1_cnn()
layer1_cnn_model.summary()

UnboundLocalError: local variable 'a' referenced before assignment

### Check Model Accuracy

In [None]:
# Load a saved results file — presumably the predictions of the bag-of-words
# model (3k features, bigrams), judging by the file name; TODO confirm.
# NOTE(review): `csv` is not referenced by any later cell visible in this notebook.
csv = pd.read_csv("Results/Bag_of_Words_model_feature3k_bigram.csv")

In [None]:
def classified_correct(model, i, inside=False):
    """Return whether row ``i`` of a prediction table was classified correctly.

    The true rating is parsed from the row's id, which has the form
    ``<number>_<rating>`` and may or may not be wrapped in literal double
    quotes. A prediction is correct when ``sentiment`` is 0 and the rating is
    at most 5, or ``sentiment`` is 1 and the rating is above 5; any other
    ``sentiment`` value counts as incorrect.

    Args:
        model: Table (e.g. a pandas DataFrame) with an ``"id"`` and a
            ``"sentiment"`` column, indexable as ``model[column][i]``.
        i: Row label to check.
        inside: Kept for backward compatibility. It used to select between
            quoted (``True``) and unquoted (``False``) ids and raised when it
            did not match the data; both formats are now handled regardless
            of its value.

    Returns:
        bool: ``True`` if the predicted sentiment agrees with the rating.
    """
    # Drop surrounding whitespace and literal double quotes so both id formats parse.
    review_id = str(model["id"][i]).strip().strip('"')
    rating = int(review_id.split("_")[1])
    sentiment = model["sentiment"][i]

    correct_negative = sentiment == 0 and rating <= 5
    correct_positive = sentiment == 1 and rating > 5

    # Normalise to a plain bool (the comparisons may yield numpy booleans).
    return bool(correct_negative or correct_positive)

# Accuracy of the predictions in `output`: the share of rows whose predicted
# sentiment agrees with the rating embedded in the row id.
# NOTE(review): `output` is not defined in any cell visible in this notebook —
# confirm where it comes from before running on a fresh kernel.
target = output
n_reviews = target.shape[0]
hits = [classified_correct(target, row, inside=True) for row in range(n_reviews)]
correct = np.array(hits)
print(correct.sum() / n_reviews)