### Convolutional Neural Network

This model is implemented as a convolutional neural network (CNN). The text data is converted to integer sequences, and an embedding vector is then obtained for each word from pre-trained word vectors in word2vec text format (the `crawl-300d-2M.vec` file).

In [9]:
#!/usr/bin/env python3

import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [5]:
# Seed NumPy's global random number generator
np.random.seed(42)

# Constants for embedding matrix
max_features = 100000  # vocabulary cap: Tokenizer num_words and bound on embedding rows
maxlen = 200  # length every tokenized comment is padded to by pad_sequences
embed_size = 300  # number of columns (vector dimensions) per word in the embedding matrix

In [6]:
def load_data():
    '''
    Read the train and test CSV files and convert the comment text into
    padded integer sequences.

    Returns a pair (data, embedding_matrix): data is the tuple
    (x_train, y_train, x_test) and embedding_matrix is the result of
    get_embedding_weights for the tokenizer fitted here.
    '''
    print('Loading data...')

    label_columns = ['toxic', 'severe_toxic', 'obscene',
                     'threat', 'insult', 'identity_hate']

    # Adjust these paths if the data is stored somewhere else
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/test.csv')

    # Missing comments are replaced by the literal placeholder 'fillna'
    train_texts = train_df['comment_text'].fillna('fillna').values
    test_texts = test_df['comment_text'].fillna('fillna').values
    y_train = train_df[label_columns].values

    # The vocabulary is built from the train and test comments together
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

    # Text -> integer sequences -> fixed-length arrays
    x_train = sequence.pad_sequences(
        tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
    x_test = sequence.pad_sequences(
        tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

    # Weights for the embedding layer
    embedding_matrix = get_embedding_weights(tokenizer)

    return (x_train, y_train, x_test), embedding_matrix

In [7]:
# Helper function to create embedding matrix
def get_coefs(word, *arr):
    '''
    Pair a word with its vector: every positional argument after the word
    is converted into one float32 NumPy array.
    '''
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [8]:
# Get weights for the embedding layer from a pre-trained model
def get_embedding_weights(tokenizer):
    '''
    Build the weight matrix for the embedding layer from a pre-trained
    word-vector file.

    tokenizer: a fitted tokenizer exposing a word_index dict that maps
        each word to its integer index.

    Returns a NumPy array with embed_size columns. Row i holds the
    pre-trained vector of the word whose index is i; rows for words that
    are missing from the embedding file stay all-zero.
    '''
    print('Getting weights for the embedding layer...')

    # Create a dictionary to store mappings of pre-trained
    # embedding weights from the embedding file
    # Give appropriate path here
    EMBEDDING_FILE = './crawl-300d-2M.vec/crawl-300d-2M.vec'

    embeddings_index = {}
    # The context manager closes the file, and the explicit encoding keeps
    # parsing independent of the platform's default locale.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        for line in embedding_file:
            word, vector = get_coefs(*line.rstrip().rsplit(' '))
            # Skip anything that is not a full vector, e.g. a
            # "<count> <dim>" header line; a short vector would otherwise
            # be broadcast across an entire row of the matrix.
            if vector.shape[0] != embed_size:
                continue
            embeddings_index[word] = vector

    # Get the dictionary of word to index mappings for our data
    word_index = tokenizer.word_index

    # Word indices start at 1, so one extra row is needed for the highest
    # index to be addressable when the vocabulary is below max_features.
    nb_words = min(max_features, len(word_index) + 1)

    # Create an embedding matrix of appropriate size
    # to store weights for the words in our data
    embedding_matrix = np.zeros((nb_words, embed_size))

    # Iterate through each word in the data to get its
    # corresponding weight
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [7]:
def compile_model(embedding_matrix):
    '''
    Construct and compile the CNN text classifier.

    embedding_matrix: 2-D array of initial embedding weights, one row per
        word index and one column per embedding dimension.

    Returns the compiled Keras Model. The network is: embedding ->
    spatial dropout -> four parallel convolutions (one per filter size,
    each spanning the full embedding width) -> max pooling over all
    positions -> concatenate -> dropout -> dense layer with 6 sigmoid
    outputs.
    '''
    print('Compiling model...')
    # Constant for model architecture
    filter_sizes = [1, 2, 3, 5]
    num_filters = 32

    # Size the embedding layer from the weights actually supplied so the
    # layer and its initial weights can never disagree in shape.
    vocab_size, embed_dim = np.shape(embedding_matrix)

    # Construct and compile the model
    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    # Add a trailing channel axis so the sequence can be fed to Conv2D
    x = Reshape((maxlen, embed_dim, 1))(x)

    pooled = []
    for size in filter_sizes:
        # Each kernel covers `size` consecutive words across the whole
        # embedding width.
        conv = Conv2D(num_filters, kernel_size=(size, embed_dim),
                      kernel_initializer='normal', activation='elu')(x)
        # The pool window equals the number of convolution positions, so
        # a single maximum per filter is kept.
        pooled.append(MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv))

    z = Concatenate(axis=1)(pooled)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(6, activation='sigmoid')(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [8]:
def run_network(data, model, batch_size=256, epochs=3):
    '''
    Train the model, predict on the test set and write a submission file.

    data: tuple (x_train, y_train, x_test).
    model: compiled model providing fit and predict.
    batch_size: batch size used for training.
    epochs: number of training epochs.

    Returns the object returned by model.fit. As a side effect,
    './data/sample_submission.csv' is read, its six label columns are
    overwritten with the predictions, and the result is written to
    'submission_cnn.csv'.
    '''
    x_train, y_train, x_test = data

    print('Splitting data...')
    # Split data: 95% for training, the remainder for validation
    X_train, X_val, y_train, y_val = train_test_split(
        x_train, y_train, train_size=0.95, random_state=233)

    # reshape returns a new array, so the result has to be assigned;
    # the original calls discarded it and therefore had no effect.
    X_train = X_train.reshape((X_train.shape[0], maxlen))
    X_val = X_val.reshape((X_val.shape[0], maxlen))

    print('Training the model...')
    # Fit the data
    hist = model.fit(X_train, y_train,
                     batch_size=batch_size, epochs=epochs,
                     validation_data=(X_val, y_val), verbose=1)

    print('Predicting on the unseen data...')
    # Predict
    y_pred = model.predict(x_test, batch_size=1024)

    # Save results
    submission = pd.read_csv('./data/sample_submission.csv')
    submission[
        ['toxic', 'severe_toxic', 'obscene',
         'threat', 'insult', 'identity_hate']] = y_pred
    submission.to_csv('submission_cnn.csv', index=False)

    print('Predictions done... Results saved successfully!')

    return hist

In [11]:
# Get data
data, embedding_matrix = load_data()

# Compile model
model = compile_model(embedding_matrix)

# Run network
hist = run_network(data, model)

# Persist the per-epoch training metrics. Values are converted to plain
# Python floats first because the history may hold NumPy scalars, which
# the json module cannot serialize.
history = {
    metric: [float(value) for value in values]
    for metric, values in hist.history.items()
}
with open('./cnn_history.json', 'w') as f:
    json.dump(history, f)