In [None]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding,Input,LSTM,Concatenate,Dense
from keras.models import Model

from sklearn.model_selection import train_test_split

In [None]:
# Prefix prepended to every data file name this notebook reads
# (train.csv, test.csv and the GloVe vectors file).
DATA_PATH = "./"
# Shared WordNet lemmatizer instance used by cutter().
WNL = WordNetLemmatizer()
# Sequence length passed to pad_sequences and used as the model's input length.
MAX_SENT_LEN = 15
# Width of each word vector; only GloVe lines with exactly this many values are kept.
EMBEDDING_DIM = 300
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32
# Number of epochs passed to model.fit.
N_EPOCHS = 10

## Process Data

In [None]:
# Lemmatization (reduce a word to its base form)
def cutter(word):
    """Return the lemma of ``word``; words shorter than four characters pass through.

    Longer words are lemmatized as a noun first, and that result is then
    lemmatized as a verb, both through the shared ``WNL`` lemmatizer.
    """
    if len(word) >= 4:
        noun_form = WNL.lemmatize(word, "n")
        return WNL.lemmatize(noun_form, "v")
    return word

# Ordered literal substitutions applied by preprocess(). Order matters: the
# specific contractions ("won't", "can't", "what's", ...) are expanded before
# the generic "n't" / "'s" suffix rules get a chance to match them.
_PREPROCESS_REPLACEMENTS = (
    (",000,000", "m"), (",000", "k"),
    ("′", "'"), ("’", "'"),
    ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
    ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
    ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
    ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
    ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
    ("€", " euro "), ("'ll", " will"), ("=", " equal "), ("+", " plus "),
)

# Punctuation that is replaced by a single space. Written as a raw string so
# the pattern contains no invalid Python escape sequences.
_PUNCTUATION_RE = re.compile(r'[“”(\'…)!^".;:,\-?？{}\[\]/*@]')

# Trailing groups of zeros are abbreviated ("5000000" -> "5m", "3000" -> "3k").
# The negative lookahead keeps numbers that merely contain zeros intact,
# e.g. "10005" must not become "1k5".
_MILLIONS_RE = re.compile(r"([0-9]+)000000(?![0-9])")
_THOUSANDS_RE = re.compile(r"([0-9]+)000(?![0-9])")


def preprocess(string):
    """Normalize one question string for tokenization.

    Steps, in order:
      1. lower-case the text;
      2. apply the literal substitutions in ``_PREPROCESS_REPLACEMENTS``
         (thousands separators, contractions, currency and math symbols);
      3. replace the punctuation matched by ``_PUNCTUATION_RE`` with spaces;
      4. abbreviate trailing zeros of numbers to ``m`` / ``k``.

    Whitespace is not collapsed, so the result may contain runs of spaces.

    Args:
        string: the raw text to clean (must be a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    string = string.lower()
    for old, new in _PREPROCESS_REPLACEMENTS:
        string = string.replace(old, new)

    string = _PUNCTUATION_RE.sub(' ', string)
    string = _MILLIONS_RE.sub(r"\1m", string)
    string = _THOUSANDS_RE.sub(r"\1k", string)

    # Per-word lemmatization with cutter() is currently disabled:
    #     string = ' '.join([cutter(w) for w in string.split()])
    return string

In [None]:
def preprocess_df(df):
    """Clean both question columns of ``df`` in place and drop very short pairs.

    ``question1`` and ``question2`` are converted to ``str`` and passed through
    ``preprocess``. Any row where either cleaned question has fewer than 10
    characters is removed, and the index is then renumbered from 0.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns. It is
            modified in place.

    Returns:
        None.
    """
    for column in ('question1', 'question2'):
        df[column] = df[column].apply(lambda text: preprocess(str(text)))

    # Questions with fewer than 10 characters are discarded (the whole pair goes).
    too_short = (df['question1'].map(len) < 10) | (df['question2'].map(len) < 10)
    df.drop(df.index[too_short.to_numpy()], inplace=True)

    # reset_index() returns a new frame unless inplace=True, so the previous
    # bare call had no effect; drop=True avoids adding the old index as a column.
    df.reset_index(drop=True, inplace=True)

    # Alternative, currently disabled: derive MAX_SENT_LEN from the 80th
    # percentile of the per-question word counts instead of a fixed constant.
    # df['q1_wc'] = df.question1.apply(lambda x : len(x.split()))
    # df['q2_wc'] = df.question2.apply(lambda x : len(x.split()))
    # MAX_SENT_LEN = int(max(np.percentile(df.q1_wc, 80),np.percentile(df.q2_wc, 80)))
    
    
def tokenize_data(df, tokenizer):
    """Turn both question columns of ``df`` into fixed-length integer sequences.

    Each column is encoded with ``tokenizer.texts_to_sequences`` and the result
    is passed to ``pad_sequences`` with ``maxlen=MAX_SENT_LEN``.

    Returns:
        A ``(data_1, data_2)`` tuple for ``question1`` and ``question2``.
    """
    def encode(texts):
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=MAX_SENT_LEN)

    return encode(df.question1), encode(df.question2)

In [None]:
# Load the comma-separated train and test files from DATA_PATH.
train_df = pd.read_csv(DATA_PATH + "train.csv")
test_df = pd.read_csv(DATA_PATH + "test.csv")

In [None]:
# Clean both frames; preprocess_df modifies the frame it is given.
# NOTE(review): this also removes short-question rows from test_df - confirm that is intended.
for frame in (train_df, test_df):
    preprocess_df(frame)
print("preprocess_df finished")

In [None]:
# Build the vocabulary from the training questions only (both columns).
tokenizer = Tokenizer()
# Optional vocabulary cap, currently disabled (MAX_VOCAB_SIZE is not defined in this notebook):
# tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(np.append(train_df.question1, train_df.question2))
word_index = tokenizer.word_index
print("tokenizer initialization finished")

# Padded integer sequences for each question column, plus the duplicate labels.
# (The former np.array([]) placeholders were overwritten immediately and are gone.)
train_data_1, train_data_2 = tokenize_data(train_df, tokenizer)
train_labels = np.array(train_df.is_duplicate)

# The test questions are encoded with the vocabulary fitted on the training data.
test_data_1, test_data_2 = tokenize_data(test_df, tokenizer)
print("get_data_labels finished")


## Embedding

In [None]:
def read_glove_embedding(file_name, embedding_dim, encoding="utf-8"):
    """Load word vectors from a GloVe-style text file.

    Every useful line has the form ``<token> <v1> ... <vN>`` with
    ``N == embedding_dim`` whitespace-separated numbers. Blank lines and lines
    with any other number of fields (for example tokens that themselves
    contain whitespace) are skipped.

    Args:
        file_name: path of the vectors file.
        embedding_dim: number of vector components expected on each line.
        encoding: text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        dict mapping each token to a ``float32`` NumPy array of length
        ``embedding_dim``.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a line has the right field count but a non-numeric value.
    """
    embeddings_index = {}
    expected_fields = embedding_dim + 1  # the token plus its vector components

    # The context manager closes the file even if parsing raises.
    with open(file_name, encoding=encoding) as glove_file:
        for line in glove_file:
            values = line.split()
            # Also rejects blank lines, which would otherwise fail on values[0].
            if len(values) != expected_fields:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

    return embeddings_index

In [None]:
# Read the whole GloVe file into a word -> vector dict, keeping only lines
# that carry exactly EMBEDDING_DIM values.
embeddings = read_glove_embedding(DATA_PATH+"glove.840B.300d.txt", EMBEDDING_DIM)

In [None]:
# Build an embedding matrix that covers only the words in our vocabulary.
# Every row starts as small uniform noise, so a word without a pre-trained
# vector keeps a random embedding.
vocab_rows = len(word_index) + 1  # +1 leaves row 0 for the zero padding index
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(vocab_rows, EMBEDDING_DIM))

# Overwrite the rows of words that do have a pre-trained vector.
# NOTE(review): row 0 (padding) is presumably never in word_index and so stays random - confirm.
for word, row in word_index.items():
    pretrained = embeddings.get(word)
    if pretrained is not None:
        embeddings_matrix[row] = pretrained

# del embeddings

## Model

In [None]:
# Shared encoder: the same embedding layer and the same LSTM layer are applied
# to both questions, so the two branches use identical weights.
# The embedding is initialised from embeddings_matrix and is not trained
# (trainable=False); padding zeros are not masked (mask_zero=False).
embedding_layer = Embedding(input_dim=len(word_index)+1,
                            output_dim=EMBEDDING_DIM,
                            weights=[embeddings_matrix],
                            input_length=MAX_SENT_LEN,
                            trainable=False,
                            mask_zero=False,
                            name='embedding_layer')
# A single LSTM with 75 units, reused for both questions.
lstm_layer = LSTM(75, recurrent_dropout=0.2)

# Branch 1: padded token ids of question1 -> embeddings -> LSTM output.
question_input_1 = Input(shape=(MAX_SENT_LEN,), dtype="int32")
embedded_1 = embedding_layer(question_input_1)
lstm_1 = lstm_layer(embedded_1)

# Branch 2: the same layers applied to question2.
question_input_2 = Input(shape=(MAX_SENT_LEN,), dtype="int32")
embedded_2 = embedding_layer(question_input_2)
lstm_2 = lstm_layer(embedded_2)


In [None]:
# Classification head: concatenate the two LSTM outputs and map them to a
# single sigmoid unit.
concat_layer = Concatenate(name='q1_q2_concat')
merged = concat_layer([lstm_1, lstm_2])

output_layer = Dense(units=1, activation='sigmoid', name='output_layer')
output_prob = output_layer(merged)

# NOTE(review): the model name says "cnn" although the encoder above is an LSTM.
model = Model(
    inputs=[question_input_1, question_input_2],
    outputs=output_prob,
    name='text_pair_cnn',
)
model.summary()


In [None]:
# Hold out 10% of the training pairs for validation; the fixed random_state
# makes the split repeatable.
split = train_test_split(
    train_data_1,
    train_data_2,
    train_df.is_duplicate,
    test_size=0.1,
    random_state=10,
)
X_train_q1, X_val_q1, X_train_q2, X_val_q2, y_train, y_val = split

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the two question inputs and evaluate on the held-out pairs after each epoch.
model.fit(
    x=[X_train_q1, X_train_q2],
    y=y_train,
    validation_data=([X_val_q1, X_val_q2], y_val),
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
)
