## Importing libraries 

In [1]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import gensim.downloader
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, GRU, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

2022-11-07 18:05:41.176724: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-07 18:05:41.750036: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-07 18:05:41.750095: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-07 18:05:41.816623: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-07 18:05:43.113192: W tensorflow/stream_executor/platform/de

## Bases importing

In [2]:
qa_base = pd.read_csv('./insurance_qna_dataset.csv',sep='\t',index_col=0)
questions_number = qa_base.shape[0]

qm_base = pd.read_csv("Test questions dataset.csv")
test_questions_number = qm_base.shape[0]

# Load training and test set

df_train = pd.read_csv("train.csv").iloc[:1000, :]
df_test = pd.read_csv("test.csv").iloc[:500, :]

# qa_base.head()
df_train.head()

  df_test = pd.read_csv("test.csv").iloc[:500, :]


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


## Global variables

In [4]:
# File paths
TRAIN_CSV = "./test.csv"
TEST_CSV = "./test.csv"
MODEL_SAVING_DIR = "./Sprint 3/"

## Create embedding matrix

In [5]:
stops = set(stopwords.words('english'))

def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

# Prepare embedding
vocabulary = dict()
inverse_vocabulary = []  # '' will never be used, it is only a placeholder for the [0, 0, ....0] embedding

questions_cols = ['question1', 'question2']

# Iterate over the questions only of both training and test datasets
for dataset in [df_train, df_test]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):

                # Check for unwanted words

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])

            # Replace questions as word to question as number representation
            dataset.at[index, question] =  q2n

## Prepare training and validation data

In [6]:
max_seq_length = max(df_train.question1.map(lambda x: len(x)).max(),
                     df_train.question2.map(lambda x: len(x)).max(),
                     df_test.question1.map(lambda x: len(x)).max(),
                     df_test.question2.map(lambda x: len(x)).max())

X = df_train[questions_cols]
Y = df_train['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.1)

# Split to dicts
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': df_test.question1, 'right': df_test.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

print(X_train)
# Zero padding
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

    
# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

{'left': 331    [0, 118, 2, 1585, 66, 1586, 13, 2, 1851, 1852,...
875    [17, 625, 6, 1224, 3603, 8, 2688, 66, 2703, 17...
487    [0, 1, 2477, 2478, 66, 43, 100, 338, 347, 2477...
457           [0, 118, 2, 601, 700, 80, 1220, 2362, 150]
295    [0, 167, 38, 2, 1716, 13, 1717, 1253, 66, 1104...
                             ...                        
88              [58, 1, 2, 198, 672, 673, 674, 675, 676]
390    [0, 1, 2, 552, 412, 822, 2085, 2086, 66, 2087,...
958    [27, 3821, 28, 17, 1153, 8, 3822, 1534, 164, 3...
120               [43, 31, 216, 137, 855, 539, 856, 857]
183                    [27, 100, 17, 50, 36, 1182, 1183]
Name: question1, Length: 900, dtype: object, 'right': 331    [0, 118, 2, 1585, 66, 1586, 13, 1089, 36, 1853...
875    [17, 44, 3607, 66, 17, 176, 6, 7, 254, 8, 988,...
487                           [0, 1, 2477, 2478, 8, 899]
457    [0, 118, 2, 601, 700, 102, 28, 136, 94, 2, 502...
295    [0, 167, 38, 2, 1716, 13, 2, 1719, 13, 2, 1720...
                         

## Contrastive loss function

In [7]:
def loss(margin):
    """Provides 'constrastive_loss' an enclosing scope with variable 'margin'.

    Arguments:
        margin: Integer, defines the baseline for distance for which pairs
                should be classified as dissimilar. - (default is 1).

    Returns:
        'constrastive_loss' function with data ('margin') attached.
    """

    # Contrastive loss = mean( (1-true_value) * square(prediction) +
    #                         true_value * square( max(margin-prediction, 0) ))
    def contrastive_loss(y_true, y_pred):
        """Calculates the constrastive loss.

        Arguments:
            y_true: List of labels, each label is of type float32.
            y_pred: List of predictions of same length as of y_true,
                    each label is of type float32.

        Returns:
            A tensor containing constrastive loss as floating point value.
        """
        y_true = tf.cast(y_true, tf.float32)
        #print(y_true)
        #print(y_pred)
        
        square_pred = tf.math.square(y_pred)
        margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        return tf.math.reduce_mean(
            (1 - y_true) * square_pred + (y_true) * margin_square
        )

    return contrastive_loss

## Build the model

In [11]:
# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 10

def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

embedding_dim = 25

embedding_layer = Embedding(len(vocabulary), embedding_dim, input_length=max_seq_length) #len(embeddings) = len(vocabulary)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_gru = GRU(n_hidden)

left_output = shared_gru(encoded_left)
right_output = shared_gru(encoded_right)

# Calculates the distance as defined by the MaLSTM model
magru_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
magru = Model([left_input, right_input], [magru_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

magru.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

#callback checkpoint (save best model)
model_checkpoint_callback = ModelCheckpoint(
    filepath='./igor/checkpoint',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Start training
training_start_time = time()

# magru_trained = magru.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, epochs=n_epoch,
#                             validation_data=([X_validation['left'], X_validation['right']], Y_validation), callbacks=[model_checkpoint_callback])

print(X_train['left'])
print(X_train['right'])

# print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

[[   0    0    0 ... 1851 1852 1021]
 [   0    0    0 ...   17   44 3606]
 [   0    0    0 ... 2478    8  899]
 ...
 [   0    0    0 ... 3823  374  643]
 [   0    0    0 ...  539  856  857]
 [   0    0    0 ...   36 1182 1183]]
[[   0    0    0 ... 1853 1851 1852]
 [   0    0    0 ...   87   17 1224]
 [   0    0    0 ... 2478    8  899]
 ...
 [   0    0    0 ... 3824  374  643]
 [   0    0    0 ...  858    6  320]
 [   0    0    0 ...  102    2 1185]]


## Plotting the results

In [None]:
# Plot accuracy

plt.plot(malstm_trained.history['accuracy'])

plt.plot(malstm_trained.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot loss
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()