# Week 7.1 Convolutional Neural Networks

In [None]:
import numpy as np  # Keras takes care of most of this but it likes to see Numpy arrays
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize.casual import casual_tokenize
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
import pandas as pd
from random import shuffle
# Fetch the course repository and make it the working directory
# (presumably so relative paths such as "data/..." used later resolve -- confirm).
!git clone https://github.com/Louismac/NLP-Public
%cd NLP-Public
# Code edited from the NLPIA book, ch. 7: https://github.com/totalgood/nlpia

# Colab users

In [None]:
# Run this cell ONLY if you are on Colab.
# It downloads the (very large) GoogleNews word2vec archive into /root/input/ of your Colab instance;
# wget's -c flag resumes a partially downloaded file instead of starting again.
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
embeddings_file = '/root/input/GoogleNews-vectors-negative300.bin.gz'  # path of the archive downloaded by the line above

## Classification 

In [None]:
# Load the first 200,000 entries of the word2vec file (described in the course notes
# as the 200,000 most common words) into `wv`.
# Everything is held in memory, so a machine with little RAM may slow down or struggle.
# Not on Colab? Point embeddings_file at your local copy instead, e.g.:
#embeddings_file = "GoogleNews-vectors-negative300.bin.gz"
wv = KeyedVectors.load_word2vec_format(
    embeddings_file,
    binary=True,
    limit=200000,
)

In [None]:
# Reuse the vectors already loaded into `wv` by the previous cell. Reading the
# same file a second time only duplicated the whole embedding table in memory.
word_vectors = wv
def tokenize_and_vectorize(dataset):
    """Turn each text in *dataset* into a list of word vectors.

    Every sample is split with ``casual_tokenize`` and each token is looked up
    in the module-level ``word_vectors``. Tokens that are missing from the
    embedding vocabulary are dropped, so a sample may end up shorter than its
    token count (or empty).

    Returns a list with one entry per sample, in the original order; each
    entry is the list of vectors found for that sample.
    """
    def _embed(text):
        found = []
        for tok in casual_tokenize(text):
            try:
                found.append(word_vectors[tok])
            except KeyError:
                # Token is not in the Google w2v vocab -- skip it
                continue
        return found

    return [_embed(text) for text in dataset]

def pad_trunc(data, maxlen, vector_len=None):
    """Pad with zero vectors, or truncate, so every sample has ``maxlen`` vectors.

    Args:
        data: Indexable collection of samples; each sample is a list of
            equal-length vectors (a sample may be empty).
        maxlen: Number of vectors every returned sample will contain.
        vector_len: Length of the zero vectors used for padding. When omitted
            it is inferred from the first non-empty sample.

    Returns:
        A new list of new sample lists. The input samples are left untouched.

    Raises:
        ValueError: If padding is required but the vector length is unknown
            (every sample is empty and ``vector_len`` was not given).
    """
    if vector_len is None:
        # Look for the first sample that actually holds a vector. Indexing
        # data[0][0] directly fails when the first sample has no known tokens.
        vector_len = next(
            (len(sample[0]) for sample in data if len(sample) > 0), None
        )

    # One shared zero vector is reused for every padding slot to keep memory
    # use low; treat the padding entries as read-only.
    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slice into a fresh list so the caller's sample is never mutated.
        fitted = list(sample[:maxlen])
        shortfall = maxlen - len(fitted)
        if shortfall > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: all samples are empty, pass vector_len explicitly"
                )
            fitted.extend([zero_vector] * shortfall)
        new_data.append(fitted)
    return new_data

### Dataset

In [None]:
file_name = "galleryorrecordlabel"
dataset = pd.read_csv("data/" + file_name + ".tsv", sep="\t")
# Shuffle the DataFrame rows. The shuffle is seeded so that, together with the
# seeded split below, every run produces the same train/test partition.
dataset = dataset.sample(frac=1, random_state=0)
# One list of word vectors per row of the "text" column
features = tokenize_and_vectorize(dataset["text"])
# Hold out 30% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["label"], test_size=0.3, random_state=0
)

In [None]:
maxlen = 50             # Number of token vectors per sample after pad_trunc: longer samples are cut, shorter ones zero-padded
embedding_dims = 300    # Length of the token vectors we pass into the Convnet; must match the length of the loaded word vectors

In [None]:
# The samples still have different lengths at this point, so they cannot be
# stacked into one rectangular array (recent NumPy versions raise ValueError
# for such ragged input). Show the number of samples in each split instead.
(len(x_train),), (len(x_test),)

In [None]:
# Bring every sample to exactly `maxlen` vectors ...
x_train, x_test = pad_trunc(x_train, maxlen), pad_trunc(x_test, maxlen)

# ... then pack features into (samples, maxlen, embedding_dims) arrays for Keras
x_train = np.asarray(x_train).reshape(len(x_train), maxlen, embedding_dims)
x_test = np.asarray(x_test).reshape(len(x_test), maxlen, embedding_dims)

# Labels become plain Numpy arrays as well
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Check the shapes of the padded train / test arrays
np.shape(x_train), np.shape(x_test)

In [None]:
batch_size = 32        # How many samples to show the net before backpropagating the error and updating the weights
filters = 1           # Number of convolution filters we will train
kernel_size = 3         # The width of the filters in tokens; each filter is a matrix of weights of size kernel_size x embedding_dims, i.e. 3 x 300 in our case
hidden_dims = 10       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 5             # Number of passes over the training data

In [None]:
print('Build model...')
model = Sequential()

# A Conv1D layer that learns `filters` filters, each sliding over groups of
# `kernel_size` consecutive word vectors. With 'valid' padding the filter is
# only applied where it fits entirely inside the sequence.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# Global max pooling keeps, for each filter, its largest activation over the sequence:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer (with dropout applied during training):
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Binary cross-entropy matches the single sigmoid output
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that printed a stray "None" after the table).
model.summary()

In [None]:
# Train the network; the held-out test split is used as validation data
# so its loss and accuracy are reported after every epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

# Persist the architecture (JSON) and the learned weights (HDF5) separately
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("cnn_weights.h5")

print('Model saved.')