### Long Short Term Memory - Convolutional Neural Network

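This model combines a recurrent network with a CNN (an LSTM-CNN style architecture). The recurrent part is implemented with a bidirectional GRU layer in Keras (a gated recurrent unit, a lighter-weight relative of the LSTM), and a 1D convolution layer is built on top of it.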

In [2]:
#!/usr/bin/env python3

import json
import os

import numpy as np 
import pandas as pd 
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from keras.layers import Dense, Input, Bidirectional, Conv1D, GRU
from keras.layers import SpatialDropout1D, Embedding, concatenate
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'

In [2]:
# Vocabulary cap: passed to the tokenizer as num_words and bounds the
# number of rows in the embedding matrix
max_features = 100000
# Passed as maxlen to pad_sequences; also the length of the model's input
maxlen = 150
# Number of columns in the embedding matrix (size of one word vector)
embed_size = 300

In [5]:
def load_data(train_path='./data/train.csv', test_path='./data/test.csv'):
    '''
    Load the comment text from CSV files, convert it into padded integer
    sequences and build the embedding matrix for the fitted vocabulary.

    Parameters
    ----------
    train_path : str
        CSV file holding a "comment_text" column plus the six label columns.
    test_path : str
        CSV file holding a "comment_text" column.

    Returns
    -------
    data : tuple
        (x_train, y_train, x_test) where x_train / x_test are the padded
        sequences and y_train is the array of label values.
    embedding_matrix
        Whatever get_embedding_weights returns for the fitted tokenizer.
    '''
    print('Loading data...')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    label_columns = ["toxic", "severe_toxic", "obscene",
                     "threat", "insult", "identity_hate"]

    # fillna returns a new Series instead of modifying the column, so it has
    # to be chained; otherwise missing comments stay NaN and break tokenizing
    X_train = train["comment_text"].fillna("fillna").str.lower()
    X_test = test["comment_text"].fillna("fillna").str.lower()
    y_train = train[label_columns].values

    # Convert text to numeric sequences and pad them.
    # The tokenizer is fitted on train and test text together.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    data = (x_train, y_train, x_test)

    # Get weights for embedding layer
    embedding_matrix = get_embedding_weights(tokenizer)

    return data, embedding_matrix

In [6]:
# Helper function to create embedding matrix
def get_coefs(word, *arr):
    '''
    Pair a word with its coefficients.

    All positional arguments after `word` are packed into a single
    float32 NumPy array; the result is the tuple (word, array).
    '''
    vector = np.array(arr, dtype='float32')
    return (word, vector)

In [9]:
# Get weights for the embedding layer from a pre-trained model
def get_embedding_weights(tokenizer, embedding_file='./crawl-300d-2M.vec'):
    '''
    Build the embedding matrix for the tokenizer's vocabulary from a
    pre-trained word-vector text file.

    Parameters
    ----------
    tokenizer
        Fitted tokenizer exposing a `word_index` dict (word -> integer index).
    embedding_file : str
        Path to the vector file: one entry per line, the word followed by
        its space-separated coefficients.

    Returns
    -------
    numpy.ndarray
        Array of shape (nb_words, embed_size). Row i holds the pre-trained
        vector of the word with index i; rows for words without a
        pre-trained vector (and row 0) stay all-zero.
    '''
    print('Getting weights for the embedding layer...')

    # Create a dictionary to store mappings of pre-trained
    # embedding weights from the embedding file
    embeddings_index = {}
    # Explicit encoding avoids depending on the platform default, and the
    # context manager guarantees the file handle is closed.
    with open(embedding_file, encoding='utf-8') as vec_file:
        for line in vec_file:
            # Split from the right at most embed_size times so everything
            # left of the coefficients is kept as the word
            word, vector = get_coefs(*line.rstrip().rsplit(' ', embed_size))
            # Skip entries that are not full vectors (e.g. a leading
            # "<count> <dim>" header line or a blank line); a short vector
            # would otherwise be broadcast across a whole matrix row
            if vector.shape[0] != embed_size:
                continue
            embeddings_index[word] = vector

    # Get the dictionary of word to index mappings for our data
    word_index = tokenizer.word_index
    # Word indices start at 1 (0 is reserved), so a vocabulary of N words
    # needs N + 1 rows to make index N addressable
    nb_words = min(max_features, len(word_index) + 1)

    # Create an embedding matrix of appropriate size
    # to store weights for the words in our data
    embedding_matrix = np.zeros((nb_words, embed_size))

    # Iterate through each word in the data to get its
    # corresponding weight
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [11]:
def compile_model(embedding_matrix):
    '''
    Build and compile the recurrent + convolutional classifier.

    Layer stack: frozen pre-trained embedding -> spatial dropout ->
    bidirectional GRU (returning the full sequence) -> 1D convolution ->
    global average pooling and global max pooling, concatenated ->
    dense layer with 6 sigmoid outputs.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D weight matrix for the embedding layer, one row per word index.

    Returns
    -------
    The compiled Keras model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    '''
    # Size the embedding layer from the weight matrix itself so the layer
    # always matches the weights it is given, even when the vocabulary is
    # smaller than the configured cap
    vocab_size, vector_size = embedding_matrix.shape

    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(vocab_size,
                  vector_size,
                  weights=[embedding_matrix],
                  trainable=False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
    # Summarise the convolved sequence two ways and use both as features
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # One independent sigmoid per label
    preds = Dense(6, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

    return model

In [12]:
def run_network(data, model, batch_size=128, epochs=3):
    '''
    Train the model, predict on the test set and write a submission file.

    Parameters
    ----------
    data : tuple
        (x_train, y_train, x_test) as produced by load_data.
    model
        Compiled Keras model.
    batch_size : int
        Batch size used for training.
    epochs : int
        Number of training epochs.

    Returns
    -------
    The history object returned by model.fit.

    Side effects
    ------------
    Reads './data/sample_submission.csv' and writes
    'submission_lstm-cnn.csv' to the working directory.
    '''
    x_train, y_train, x_test = data
    label_columns = ['toxic', 'severe_toxic', 'obscene',
                     'threat', 'insult', 'identity_hate']

    print('Splitting data...')
    # Hold out 10% of the training data for validation (fixed seed).
    # The arrays are used as split; the previous reshape calls discarded
    # their results and therefore never had any effect.
    X_train, X_val, y_train, y_val = train_test_split(
        x_train, y_train, train_size=0.90, random_state=233)

    print('Training the model...')
    # Fit the data
    hist = model.fit(X_train, y_train,
                     batch_size=batch_size, epochs=epochs,
                     validation_data=(X_val, y_val), verbose=1)

    print('Predicting on the unseen data...')
    # Predict
    y_pred = model.predict(x_test, batch_size=1024)

    # Save results: fill the label columns of the sample submission
    submission = pd.read_csv('./data/sample_submission.csv')
    submission[label_columns] = y_pred
    submission.to_csv('submission_lstm-cnn.csv', index=False)

    print('Predictions done... Results saved successfully!')

    return hist

In [1]:
# Get data
data, embedding_matrix = load_data()

# Compile model
model = compile_model(embedding_matrix)

# Run network
hist = run_network(data, model)

# Metric values may be recorded as NumPy scalars, which the json module
# cannot encode; convert them to plain Python floats before saving
history = {name: [float(value) for value in values]
           for name, values in hist.history.items()}

with open('./lstm-cnn_history.json', 'w') as f:
    json.dump(history, f)