In this notebook I'm trying to compare two approaches for capturing intricate features in quora question similarity problem. 



In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import Image

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import matplotlib.pyplot as plt

import os
print(os.listdir("../input"))

import numpy as np
import multiprocessing as mp
import tensorflow as tf
from sklearn.metrics import roc_auc_score

import string
import spacy 
import en_core_web_sm
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

from sklearn.random_projection import sparse_random_matrix
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, BatchNormalization, GRU,LSTM, Dense, Concatenate,Bidirectional, Conv1D, AveragePooling1D,\
                                                    Lambda, CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D,\
                                                    SpatialDropout1D, Dropout, add, concatenate 

from keras.models import Model
from keras.metrics import binary_accuracy
from keras.utils import plot_model
from keras import backend as K
from sklearn.model_selection import train_test_split

# Any results you write to the current directory are saved as output.

['glove-global-vectors-for-word-representation', 'submission-bert-on-quora', 'quora-question-pairs']


Using TensorFlow backend.


In [2]:
train_data = pd.read_csv("../input/quora-question-pairs/train.csv")
test_data = pd.read_csv("../input/quora-question-pairs/test.csv")
embedding_dim = 200
embedding_file_path = "../input/glove-global-vectors-for-word-representation/glove.6B."+str(embedding_dim)+"d.txt"
train_data, val_data, train_y, val_y = train_test_split(train_data[['question1', 'question2']], train_data['is_duplicate'], test_size = 0.2 )

In [3]:
maxlen = 20
vocab_size = 10000

# Preprocessing pipeline

In [4]:
#This is very generic list of contractions and most of the words may not even appear in an item description
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not",
                "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
                "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
                "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", 
                "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                "mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
                "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
                "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", 
                "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", 
                "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", 
                "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", 
                "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
                "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", 
                "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", 
                "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                "you're": "you are", "you've": "you have" }


In [5]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 contractions={},
                 stop_words={},
                 spellings={},
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. contractions
            3. Punctuation removal
            4. Stop words removal - words like not are excluded from stop words
        """
       
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs
        self.contractions = contractions
        self.stop_words = stop_words
        self.spellings = spellings
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)   # split data for parallel processing
        pool = mp.Pool(cores)                           # create pools
        data = pd.concat(pool.map(self._preprocess_part, data_split))   # concatenate results
        pool.close()                                  
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        lowercase_text = self._lowercase(text)
        expanded_contractions = self._expand_contactions(lowercase_text)
        #removed_punct = self._remove_punct(expanded_contractions)
        #removed_stop_words = self._remove_stop_words(removed_punct)
        return (expanded_contractions)
   
    def _lowercase(self, text):
        return text.lower()
        
    def _expand_contactions(self, doc):
        new_text = ""
        for t in doc.split():
            if (t in string.punctuation) or (t in self.stop_words) or (t in string.punctuation):
                continue
            if t in contractions:
                new_text = new_text + " " + (contractions[t])
            else: 
                new_text = new_text + " " + t
        return new_text
    
    
#     def _normalize(self, text):
#         # some issues in normalise package
#         try:
#             return ' '.join(normalise(text, user_abbrevs=self.user_abbrevs, verbose=True))
#         except:
#             return text

#     def _remove_punct(self, doc):
#         return ' '.join([t for t in doc.split() if t not in string.punctuation])

#     def _remove_stop_words(self, doc):
#         return ' '.join([t for t in doc.split() if t not in self.stop_words])

In [6]:
refined_stop_words = stop_words.ENGLISH_STOP_WORDS - {"not", "none", "nothing", "nowhere", "never", "cannot",
                                "cant", "couldnt", "except", "hasnt", "neither", "no", 
                                 "nobody", "nor", "without", "when", "why","whom","who","what","where","how"}

In [7]:
%time

train_data["question1"] = train_data["question1"].fillna("None")
train_data["question2"] = train_data["question2"].fillna("None")
val_data["question1"] = val_data["question1"].fillna("None")
val_data["question2"] = val_data["question2"].fillna("None")
test_data["question1"] = test_data["question1"].fillna("None")
test_data["question2"] = test_data["question2"].fillna("None")


train_data['question1'] = train_data['question1'].astype(str)
train_data['question2'] = train_data['question2'].astype(str)
val_data['question1'] = val_data['question1'].astype(str)
val_data['question2'] = val_data['question2'].astype(str)
test_data['question1'] = test_data['question1'].astype(str)
test_data['question2'] = test_data['question2'].astype(str)

textPreprocessor = TextPreprocessor(n_jobs=-1, contractions=contractions,
                 stop_words=refined_stop_words)
    
train_data['question1'] = textPreprocessor.transform(train_data['question1'])
train_data['question2'] = textPreprocessor.transform(train_data['question2'])
val_data['question1'] = textPreprocessor.transform(val_data['question1'])
val_data['question2'] = textPreprocessor.transform(val_data['question2'])
test_data['question1'] = textPreprocessor.transform(test_data['question1'])
test_data['question2'] = textPreprocessor.transform(test_data['question2'])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.11 µs


In [8]:
#intsantiate the tokenizer
tokenize = Tokenizer(num_words = vocab_size, oov_token='OOV')
tokenize.fit_on_texts(np.hstack([train_data['question1'], train_data['question2'],
                                 test_data['question1'],test_data['question2']]))
vocabulary = tokenize.word_index

# sentences to sequences
train_data['sequence_1'] = tokenize.texts_to_sequences(train_data['question1'])
train_data['sequence_2'] = tokenize.texts_to_sequences(train_data['question2'])
val_data['sequence_1'] = tokenize.texts_to_sequences(val_data['question1'])
val_data['sequence_2'] = tokenize.texts_to_sequences(val_data['question2'])
test_data['sequence_1'] = tokenize.texts_to_sequences(test_data['question1'])
test_data['sequence_2'] = tokenize.texts_to_sequences(test_data['question2'])

train_padded = {}
test_padded = {}
val_padded = {}

train_padded['sequence_1'] = pad_sequences(train_data['sequence_1'], maxlen = maxlen)
train_padded['sequence_2'] = pad_sequences(train_data['sequence_2'], maxlen = maxlen)
val_padded['sequence_1'] = pad_sequences(val_data['sequence_1'], maxlen = maxlen)
val_padded['sequence_2'] = pad_sequences(val_data['sequence_2'], maxlen = maxlen)
test_padded['sequence_1'] = pad_sequences(test_data['sequence_1'], maxlen = maxlen)
test_padded['sequence_2'] = pad_sequences(test_data['sequence_2'], maxlen = maxlen)

In [9]:
class Embeddings:
    """
    When a corpus is passed, remove the words which are not in the global vocab(glove) and use most frequent vocab_size
    number of words. 
    """
    def __init__(self, embedding_dim, vocab_size):
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        
    def readEmbeddings(self, filePath):
        """
        Given a filepath of word embeddings creates and returns a dictionary of word, embedding values
        """
        # Create a dictionary for storing all {word, embedding values}
        wordToEmbeddingDict = {}
        # open the file as read only
        file = open(filePath, encoding='utf-8')
        # read all text
        for line in file:
            lineValue = line.split()
            word = lineValue[0]
            embedding = np.asarray(lineValue[1:],dtype = 'float32')
            wordToEmbeddingDict[word] = embedding
        # close the file
        file.close()
        return wordToEmbeddingDict
    
    def indexToEmbedding(self, wordToIndexDict, wordToEmbeddingDict):
        indexToEmbeddingMatrix = np.zeros((self.vocab_size+3, self.embedding_dim))
        for word, index in wordToIndexDict.items():
            if index > self.vocab_size+2:
                break
            if word in wordToEmbeddingDict.keys():
                indexToEmbeddingMatrix[index] = wordToEmbeddingDict[word]
            else:
                indexToEmbeddingMatrix[index] = np.array(np.random.uniform(-1.0, 1.0, self.embedding_dim))
        return indexToEmbeddingMatrix
    
    def indexToWord(self, wordToIndexDict):
        return {index: word for word, index in wordToIndexDict.items()}

In [10]:
embeddings = Embeddings(embedding_dim, vocab_size)
wordToEmbeddingDict = embeddings.readEmbeddings(embedding_file_path)
indexToEmbeddingMatrix = embeddings.indexToEmbedding(tokenize.word_index, wordToEmbeddingDict)

# CONV_1D model


People use convolution on signal processing for the following use cases:
- 1) Filter signals (1D audio, 2D image processing)
- 2) Check how much a signal is correlated to another
- 3) Find patterns in signals

Assuming these are the embeddings, we can use different sized kernels which with GlobalAveragePooling can find out some of the important features in the sentence. (Different sized kernels can work as n-grams capturing the features) and if two features have similar important features probably they represent similar questions. This intuition worked well but the reasoning is only based on this example and is not drawn with sufficient evidence. 

Taking the difference and product of these cnn features, could also be helpful. Suppose a question talking about "How to win a kaggle competition?" and "What does it take to win a kaggle competition?" Here we have win, kaggle and competition words, which will show quite a lot of similarity and featurization difference would be minimal.

- Things to try
Use Deep Conceptualized word embeddings

from IPython.display import Image
Image("https://leonardoaraujosantos.gitbooks.io/artificial-inteligence/content/Images/Conv1d_Manual.png")

In [11]:
# I'm also tracking auc, to understand how well this model might be able to separate the two classes
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


First I'm trying without taking the difference and product of the kernel features, so that performance could be measured clearly

In [12]:
def CNN_Model_No_Diff(emb_matrix):
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=maxlen,
        trainable=False
    )
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')

    # Define inputs
    seq1 = Input(shape=(maxlen,))
    seq2 = Input(shape=(maxlen,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    conv1a = conv1(emb1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    conv1b = conv1(emb2)
    glob1b = GlobalAveragePooling1D()(conv1b)

    conv2a = conv2(emb1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    conv2b = conv2(emb2)
    glob2b = GlobalAveragePooling1D()(conv2b)

    conv3a = conv3(emb1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    conv3b = conv3(emb2)
    glob3b = GlobalAveragePooling1D()(conv3b)

    conv4a = conv4(emb1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    conv4b = conv4(emb2)
    glob4b = GlobalAveragePooling1D()(conv4b)

    conv5a = conv5(emb1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    conv5b = conv5(emb2)
    glob5b = GlobalAveragePooling1D()(conv5b)

    conv6a = conv6(emb1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    conv6b = conv6(emb2)
    glob6b = GlobalAveragePooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    
    # Add the magic features
    # magic_input = Input(shape=(5,))
    # magic_dense = BatchNormalization()(magic_input)
    # magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    # distance_input = Input(shape=(20,))
    # distance_dense = BatchNormalization()(distance_input)
    # distance_dense = Dense(128, activation='relu')(distance_dense)
    
    # Merge the Magic and distance features with the difference layer
    merge = concatenate([mergea, mergeb])

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(800, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu', )(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    # model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', auroc])

    return model

In [13]:
cnn_model_no_diff = CNN_Model_No_Diff(indexToEmbeddingMatrix)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [14]:
cnn_model_no_diff.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 200)      2000600     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 20, 128)      25728       embedding_1[0][0]                
          

In [15]:
history = cnn_model_no_diff.fit([train_padded['sequence_1'],train_padded['sequence_2']], train_y, class_weight={0:0.64,1:0.36})

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


In [16]:
cnn_model_no_diff.evaluate([val_padded['sequence_1'], val_padded['sequence_2']], val_y)



[0.48236716873316304, 0.7526033292889062, 0.8430103278137115]

In [17]:
cnn_predictions = cnn_model_no_diff.predict([test_padded['sequence_1'],test_padded['sequence_2']])
submission = pd.DataFrame(index = test_data.index, columns = ['test_id', 'is_duplicate'])
submission['test_id'] = test_data.index
submission['is_duplicate'] = cnn_predictions
submission.to_csv('cnn_no_diff_submission.csv', index=False)
del cnn_model_no_diff, submission

# Model with filter dim == glove dim

In [18]:
# CNN model with filter sizes = glove dimensions 

def CNN_Model_glove_dim(emb_matrix):
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=maxlen,
        trainable=False
    )
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=128, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=emb_matrix.shape[1], padding='same', activation='relu')

    # Define inputs
    seq1 = Input(shape=(maxlen,))
    seq2 = Input(shape=(maxlen,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    conv1a = conv1(emb1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    conv1b = conv1(emb2)
    glob1b = GlobalAveragePooling1D()(conv1b)

    conv2a = conv2(emb1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    conv2b = conv2(emb2)
    glob2b = GlobalAveragePooling1D()(conv2b)

    conv3a = conv3(emb1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    conv3b = conv3(emb2)
    glob3b = GlobalAveragePooling1D()(conv3b)

    conv4a = conv4(emb1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    conv4b = conv4(emb2)
    glob4b = GlobalAveragePooling1D()(conv4b)

    conv5a = conv5(emb1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    conv5b = conv5(emb2)
    glob5b = GlobalAveragePooling1D()(conv5b)

    conv6a = conv6(emb1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    conv6b = conv6(emb2)
    glob6b = GlobalAveragePooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4*128+2*32,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(4*128+2*32,))([mergea, mergeb])

    # Add the magic features
    # magic_input = Input(shape=(5,))
    # magic_dense = BatchNormalization()(magic_input)
    # magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    # distance_input = Input(shape=(20,))
    # distance_dense = BatchNormalization()(distance_input)
    # distance_dense = Dense(128, activation='relu')(distance_dense)
    
    # Merge the Magic and distance features with the difference layer
    merge = concatenate([diff, mul])

    # The MLP that determines the outcome
    x = Dropout(0.4)(merge)
    x = BatchNormalization()(x)
    x = Dense(600, activation='relu')(x)

    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu', )(x)

    x = Dropout(0.25)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    # model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', auroc])

    return model

In [19]:
cnn_model_glove_dim = CNN_Model_glove_dim(indexToEmbeddingMatrix)

In [20]:
cnn_model_glove_dim.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 20, 200)      2000600     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 20, 128)      5120128     embedding_2[0][0]                
          

In [21]:
# plot_model(cnn_model, to_file="model.png", show_shapes=True)  
# Image(filename='model.png') 

In [22]:
history = cnn_model_glove_dim.fit([train_padded['sequence_1'],train_padded['sequence_2']], train_y, class_weight={0:0.64,1:0.36})

Epoch 1/1


In [23]:
cnn_model_glove_dim.evaluate([val_padded['sequence_1'], val_padded['sequence_2']], val_y)



[0.45813742315094846, 0.7667639565717287, 0.8754353767097552]

In [24]:
cnn_predictions = cnn_model_glove_dim.predict([test_padded['sequence_1'],test_padded['sequence_2']])
submission = pd.DataFrame(index = test_data.index, columns = ['test_id', 'is_duplicate'])
submission['test_id'] = test_data.index
submission['is_duplicate'] = cnn_predictions
submission.to_csv('conv_submission_with_glove.csv', index=False)

In [25]:
del cnn_model, submission, cnn_predictions

NameError: name 'cnn_model' is not defined

# Max pooling

In [26]:
# CNN model with maxpool

def CNN_Model_max_pool(emb_matrix):
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=maxlen,
        trainable=False
    )
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')


    # Define inputs
    seq1 = Input(shape=(maxlen,))
    seq2 = Input(shape=(maxlen,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    conv1a = conv1(emb1)
    glob1a = GlobalMaxPooling1D()(conv1a)
    conv1b = conv1(emb2)
    glob1b = GlobalMaxPooling1D()(conv1b)

    conv2a = conv2(emb1)
    glob2a = GlobalMaxPooling1D()(conv2a)
    conv2b = conv2(emb2)
    glob2b = GlobalMaxPooling1D()(conv2b)

    conv3a = conv3(emb1)
    glob3a = GlobalMaxPooling1D()(conv3a)
    conv3b = conv3(emb2)
    glob3b = GlobalMaxPooling1D()(conv3b)

    conv4a = conv4(emb1)
    glob4a = GlobalMaxPooling1D()(conv4a)
    conv4b = conv4(emb2)
    glob4b = GlobalMaxPooling1D()(conv4b)

    conv5a = conv5(emb1)
    glob5a = GlobalMaxPooling1D()(conv5a)
    conv5b = conv5(emb2)
    glob5b = GlobalMaxPooling1D()(conv5b)

    conv6a = conv6(emb1)
    glob6a = GlobalMaxPooling1D()(conv6a)
    conv6b = conv6(emb2)
    glob6b = GlobalMaxPooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4 * 128 + 2*32,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(4 * 128 + 2*32,))([mergea, mergeb])

    # Add the magic features
    # magic_input = Input(shape=(5,))
    # magic_dense = BatchNormalization()(magic_input)
    # magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    # distance_input = Input(shape=(20,))
    # distance_dense = BatchNormalization()(distance_input)
    # distance_dense = Dense(128, activation='relu')(distance_dense)
    
    # Merge the Magic and distance features with the difference layer
    merge = concatenate([diff, mul])

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(100, activation='relu', )(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    # model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', auroc])

    return model

In [27]:
cNN_Model_max_pool = CNN_Model_max_pool(indexToEmbeddingMatrix)
cNN_Model_max_pool.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 20, 200)      2000600     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_13 (Conv1D)              (None, 20, 128)      25728       embedding_3[0][0]                
          

In [28]:
history = cNN_Model_max_pool.fit([train_padded['sequence_1'],train_padded['sequence_2']], train_y, class_weight={0:0.64,1:0.36})
cNN_Model_max_pool.evaluate([val_padded['sequence_1'], val_padded['sequence_2']], val_y)
cnn_predictions = cNN_Model_max_pool.predict([test_padded['sequence_1'],test_padded['sequence_2']])
submission = pd.DataFrame(index = test_data.index, columns = ['test_id', 'is_duplicate'])
submission['test_id'] = test_data.index
submission['is_duplicate'] = cnn_predictions
submission.to_csv('conv_submission_max_pool.csv', index=False)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1


In [29]:
del submssion, cNN_Model_max_pool, cnn_predictions

NameError: name 'submssion' is not defined

# biGRU with attention

In [30]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb
from keras.layers import *
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [31]:
def RNN_Model(embedding_matrix):
    input_1 = Input(shape=(maxlen,))
    input_2 = Input(shape=(maxlen,))
    
    embedding_layer = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False)
    
    gru_layer = Bidirectional(GRU(64, return_sequences=True))
    
    attention_layer = Attention(maxlen)
    attention_output_1 = attention_layer(gru_layer(embedding_layer(input_1)))
    attention_output_2 = attention_layer(gru_layer(embedding_layer(input_2)))
    
    
    concat_layer = concatenate([attention_output_1, attention_output_2])
    
    x = Dropout(0.2)(concat_layer)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    model = Model([input_1,input_2],pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model
    

In [32]:
rnn_model = RNN_Model(indexToEmbeddingMatrix)

In [33]:
rnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 20, 200)      2000600     input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20, 128)      101760      embedding_4[0][0]                
          

In [34]:
rnn_history = rnn_model.fit([train_padded['sequence_1'],train_padded['sequence_2']], train_y, class_weight={0:0.64,1:0.36},
             validation_data=([val_padded['sequence_1'], val_padded['sequence_2']], val_y), verbose=1)

Train on 323432 samples, validate on 80858 samples
Epoch 1/1


In [35]:
rnn_model.history['acc'], rnn_model.history['val_acc']
rnn_model.history['loss'], rnn_model.history['val_loss']

TypeError: 'History' object is not subscriptable

In [36]:
rnn_predictions = rnn_model.predict([test_padded['sequence_1'],test_padded['sequence_2']])
submission = pd.DataFrame(index = test_data.index, columns = ['test_id', 'is_duplicate'])
submission['test_id'] = test_data.index
submission['is_duplicate'] = rnn_predictions
submission.to_csv('rnn_submission.csv', index=False)

In [37]:
del rnn_model, submission, train_data, train_y, val_data, val_y

# BERT

This model was previously trained and submitted, though the log loss on train data went upto 0.11 it was 0.24 on test data. Yet this model performs considerably well and doesn't overfit the data

In [38]:
bert_predictions = pd.read_csv("../input/submission-bert-on-quora/submission.csv")

# Combined model using accuracy

In [39]:
collected = gc.collect() 

NameError: name 'gc' is not defined

In [40]:
cnn_weightage = 0.4
rnn_weightage = 0.1
bert_weightage = 0.5

combined_csv = pd.DataFrame(index = test_data.index, columns = ['test_id', 'is_duplicate'])
combined_csv['test_id'] = test_data.index
combined_csv['is_duplicate'] = bert_predictions.is_duplicate*bert_weightage + cnn_predictions*cnn_weightage + rnn_predictions*rnn_weightage
combined_csv[combined_csv['is_duplicate']<=0.001].is_duplicate = 0
combined_csv[combined_csv['is_duplicate']>=0.99].is_duplicate = 1
combined_csv.to_csv('combined.csv', index=False)                 

MemoryError: 