In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import string
import numpy as np
import gensim
import bz2
import keras
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Load the week-2 descriptions table and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export; TODO confirm).
df = (
    pd.read_csv('../data/WEEK_2_DESCRIPTIONS.csv')
    .drop(columns='Unnamed: 0')
)

In [3]:
def bin_scores(score):
    """Map a numeric score onto one of three ordinal bins.

    Returns 2 when ``score`` is above 75, 1 when it is above 60 (and at most
    75), and 0 for everything else. Values for which both comparisons are
    false (e.g. NaN) therefore land in bin 0.
    """
    if score > 75:
        return 2
    if score > 60:
        return 1
    return 0

# Derive the three-level target label (0 / 1 / 2) from each row's critic score.
df['success'] = df['Critic_Score'].map(bin_scores)

def clean_text(text):
    """Strip ASCII punctuation from ``text`` and lower-case it.

    The text is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each token, and the tokens are
    re-joined with single spaces. A token made up only of punctuation is kept
    as an empty string, so the result may contain consecutive spaces.
    """
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(strip_punct) for token in text.split())
    return ' '.join(stripped_tokens).lower()

# Replace every plot description with its cleaned form
# (punctuation removed, lower-cased).
df['plots'] = df['plots'].map(clean_text)

In [4]:
# Word-level tokenizer: every maximal run of \w characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

# Keep the token list of each cleaned plot in a new 'tokens' column.
df['tokens'] = df['plots'].apply(tokenizer.tokenize)

In [5]:
# Pre-trained embedding file (the name suggests bz2-compressed text with
# 500-dimensional vectors; TODO confirm against the data source).
word2vec_path = '../data/enwiki_20180420_win10_500d.txt.bz2'

# `limit` caps how many vectors are read from the file.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    limit=1000000,
)

In [6]:

def get_all_word2vec(tokens_list, vector, generate_missing=False, k=300, sent_length=100):
    """Embed a token list as a fixed-size ``(sent_length, k)`` matrix.

    args:
        tokens_list: sequence of tokens for one document.
        vector: lookup supporting ``word in vector`` and ``vector[word]``,
            where each looked-up value is a length-``k`` vector.
        generate_missing: if True, a token absent from ``vector`` gets a
            random vector drawn with ``np.random.rand(k)``; otherwise its
            row stays all zeros.
        k: dimensionality of each word vector.
        sent_length: number of rows in the result. Longer token lists are
            truncated to this length, shorter ones are zero-padded at the
            end.
    returns:
        ``numpy.ndarray`` of shape ``(sent_length, k)``. An empty token list
        yields an all-zero matrix of that same shape, so every document
        produces an identically shaped array.
    """
    # Pre-allocating the padded matrix fixes the output shape for every input
    # (including an empty token list) and makes the zero padding implicit.
    embedded = np.zeros((sent_length, k))
    # Only the first `sent_length` tokens can appear in the result, so no
    # vectors are looked up or generated for tokens that would be cut off.
    for row, word in enumerate(tokens_list[:sent_length]):
        if word in vector:
            embedded[row] = vector[word]
        elif generate_missing:
            embedded[row] = np.random.rand(k)
    return embedded

def get_word2vec_embeddings(vectors, clean_questions, generate_missing=False, k=300, sent_length=100):
    """Embed every document of ``clean_questions`` with ``get_all_word2vec``.

    args:
        vectors: word-vector lookup forwarded to ``get_all_word2vec``.
        clean_questions: table whose ``'tokens'`` entry holds one token list
            per document (e.g. a DataFrame with a ``tokens`` column).
        generate_missing, k, sent_length: forwarded unchanged to
            ``get_all_word2vec``.
    returns:
        list with one embedding matrix per document, in input order.
    """
    # Read the tokens from the argument rather than from a notebook-level
    # global, so the function embeds whichever table the caller passes in.
    return [
        get_all_word2vec(tokens, vectors, generate_missing=generate_missing,
                         k=k, sent_length=sent_length)
        for tokens in clean_questions['tokens']
    ]

In [7]:
SENTENCE_LENGTH = 50
EMBEDDING_DIM = 500  # vector width requested from the word2vec lookup below
NUM_CLASSES = 3      # bin_scores produces the labels 0, 1 and 2
SEED = 40

# generate_missing=True draws random vectors for out-of-vocabulary words, so
# seed NumPy's global RNG to make the embeddings repeatable across runs.
np.random.seed(SEED)
embeddings = get_word2vec_embeddings(word2vec, df, True, EMBEDDING_DIM, SENTENCE_LENGTH)

X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    embeddings, df.success, test_size=0.2, random_state=SEED)

# Stack the per-document matrices and append a trailing axis of size 1, giving
# arrays of shape (n_documents, SENTENCE_LENGTH, EMBEDDING_DIM, 1).
X_train_cnn = np.array(X_train_word2vec).reshape((-1, SENTENCE_LENGTH, EMBEDDING_DIM, 1))
X_test_cnn = np.array(X_test_word2vec).reshape((-1, SENTENCE_LENGTH, EMBEDDING_DIM, 1))

# Fix the one-hot width explicitly: without num_classes it is inferred from
# the largest label present, so a split lacking label 2 would come out
# narrower than the other split.
y_train_cnn = to_categorical(y_train_word2vec, num_classes=NUM_CLASSES)
y_test_cnn = to_categorical(y_test_word2vec, num_classes=NUM_CLASSES)


In [8]:
def build_kum_cnn_graph(sent_len, word_vec, out_dim, filters=64, n_grams=(2, 3), num_dense_layers=3):
    '''
    Build (without compiling) a CNN text classifier with one convolution
    branch per n-gram size, followed by dense layers and a softmax output.

    args:
        sent_len: length of the input sentence. Shorter raw sentences are
            expected to be zero-padded and longer ones cut down to it.
        word_vec: dimension of the word-vector embedding (pre-trained glove
            or word2vec model).
        out_dim: dimension of the output y (number of softmax units).
        filters: number of filters of each convolutional layer.
        n_grams: non-empty sequence of n-gram sizes; each one becomes the
            kernel height of its own convolution branch. Details can be
            found in the paper this architecture is taken from.
        num_dense_layers: number of dense layers after concatenating all
            convolution outputs, counting the softmax output layer; the
            ``num_dense_layers - 1`` hidden layers halve in width each time.
    returns:
        Keras Model
    raises:
        ValueError: if ``n_grams`` is empty.
    '''
    if len(n_grams) == 0:
        raise ValueError('n_grams must contain at least one n-gram size')

    inputs = keras.layers.Input(shape=(sent_len, word_vec, 1))
    branches = []
    for h in n_grams:
        # The kernel covers h consecutive rows and the full embedding width.
        conv_layer = keras.layers.Conv2D(filters, (h, word_vec), activation='relu')(inputs)
        # Pool window of sent_len - h + 1 rows, i.e. the number of positions
        # at which a height-h kernel fits without padding.
        pool_layer = keras.layers.MaxPooling2D(pool_size=(sent_len - h + 1, 1))(conv_layer)
        branches.append(pool_layer)

    # Keras 2.x merge layers reject a list with fewer than two inputs, so a
    # single branch is passed through as-is instead of being concatenated.
    if len(branches) == 1:
        concat_layer = branches[0]
    else:
        concat_layer = keras.layers.concatenate(branches)
    in_ = keras.layers.Flatten()(concat_layer)

    prev_units = filters * len(n_grams)
    for _ in range(num_dense_layers - 1):
        # Halve the width at every hidden layer, but never go below one unit
        # (repeated halving would otherwise reach a zero-width layer).
        prev_units = max(1, prev_units // 2)
        dense_layer = keras.layers.Dense(prev_units,
                                         activation='relu',
                                         kernel_regularizer=keras.regularizers.l2(0.01))(in_)
        in_ = keras.layers.Dropout(.5)(dense_layer)

    outputs = keras.layers.Dense(out_dim, activation='softmax')(in_)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [9]:
# One convolution branch per n-gram size from 1 to 5, over 500-dimensional
# word vectors, with 3 output classes.
model = build_kum_cnn_graph(SENTENCE_LENGTH, 500, 3, n_grams=[1, 2, 3, 4, 5])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# NOTE: the held-out test split is also used as the validation data here.
model.fit(
    X_train_cnn,
    y_train_cnn,
    epochs=20,
    batch_size=100,
    validation_data=(X_test_cnn, y_test_cnn),
)

Train on 4399 samples, validate on 1100 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x266b0b66f60>

In [10]:
# model = build_kum_cnn_graph(100, 500, 3, n_grams=[2,3,4])


# model.compile(loss='categorical_crossentropy',
#                   optimizer='adam',
#                   metrics=['acc'])

# model.fit(X_train_cnn, y_train_cnn, epochs=20,
#           batch_size=40, validation_split=.2)