In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/DATA")
!ls

Mounted at /content/drive
Emotions.txt	   ner.csv	       Quora_question_pairs_test.csv
glove.6B.200d.txt  ner_dataset.csv     Quora_question_pairs_train.csv
glove.6B.50d.txt   NHC_data_Biju.xlsx  theta.txt
model.png	   outer-model.png


# Loading Data

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda
from tensorflow.keras.optimizers import RMSprop
from keras.optimizers import Adam
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.python.keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
from PIL import Image, ImageFont, ImageDraw
import random
from sklearn.model_selection import train_test_split

In [3]:
df_train = pd.read_csv('Quora_question_pairs_train.csv', index_col=False)

In [4]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
df_train.shape

(404290, 6)

In [6]:
df_train.is_duplicate.value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [7]:
df_train = df_train.drop(['id','qid1','qid2'],axis=1)

In [8]:
td_index = (df_train['is_duplicate'] == 1).to_numpy()
df_train = df_train[td_index]

In [9]:
df_train.shape

(149263, 3)

In [10]:
df_train.head()

Unnamed: 0,question1,question2,is_duplicate
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,How can I be a good geologist?,What should I do to be a great geologist?,1
11,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,What was your first sexual experience like?,What was your first sexual experience?,1


# Text Preprocessing 

In [11]:
import string
import re
import os
import nltk
#nltk.download('stopwords')
nltk.download('punkt')
#from nltk.corpus import stopwords
#stopwords_english = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
def process_text(text):
    '''
    Input: 
        text: a string containing a text
    Output:
        text_clean: a list of words containing the processed text
    
    '''
    # remove number 
    text = re.sub('[0-9]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    # remove the dates like Mar 30 2013
    text = re.sub('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2}\s\d{4}', ' ', text)
    text = re.sub(r"[/(){}\[\]\|,;.:\-\'\"$^]", '', text)
 
    #text = " ".join(word for word in text.split() if word not in stopwords_english)
    
    return  text

In [13]:
df_train['q1'] = df_train['question1'].apply(str).apply(process_text)
df_train['q2'] = df_train['question2'].apply(str).apply(process_text)


In [14]:
df_train.head()

Unnamed: 0,question1,question2,is_duplicate,q1,q2
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,Astrology I am a Capricorn Sun Cap moon and ca...,Im a triple Capricorn Sun Moon and ascendant i...
7,How can I be a good geologist?,What should I do to be a great geologist?,1,How can I be a good geologist?,What should I do to be a great geologist?
11,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,How do I read and find my YouTube comments?,How can I see all my Youtube comments?
12,What can make Physics easy to learn?,How can you make physics easy to learn?,1,What can make Physics easy to learn?,How can you make physics easy to learn?
13,What was your first sexual experience like?,What was your first sexual experience?,1,What was your first sexual experience like?,What was your first sexual experience?


In [15]:
df_train = df_train.drop(['question1','question2'],axis=1)

In [16]:
Q1_train = np.array(df_train['q1'])
Q2_train = np.array(df_train['q2'])

In [17]:
Q1_train.shape

(149263,)

In [18]:
## Building Vocab
from collections import defaultdict

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1
vocab['<UNK>'] = 0

for idx in range(len(Q1_train)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train[idx])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  39090


In [19]:
print(vocab['<PAD>'])
print(vocab['Astrology'])
print(vocab['<UNK>'])

1
3
0


In [20]:
Q1_train[1]

['How', 'can', 'I', 'be', 'a', 'good', 'geologist', '?']

In [21]:
# Converting questions to array of integers
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]

In [22]:
#Q1_train = np.array(Q1_train)
#Q2_train = np.array(Q2_train)

In [23]:
Q1_train[1:4]

array([list([27, 28, 4, 29, 6, 30, 31, 19]),
       list([27, 33, 4, 36, 11, 37, 38, 39, 40, 19]),
       list([25, 28, 44, 45, 46, 34, 47, 19])], dtype=object)

In [24]:
Q1_train = pad_sequences(Q1_train, maxlen = 20, padding = 'post', value=1)
Q2_train = pad_sequences(Q2_train, maxlen = 20, padding = 'post', value=1)

In [25]:
Q1_train.shape

(149263, 20)

In [26]:
Q1_train[1:5]

array([[27, 28,  4, 29,  6, 30, 31, 19,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1],
       [27, 33,  4, 36, 11, 37, 38, 39, 40, 19,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1],
       [25, 28, 44, 45, 46, 34, 47, 19,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1],
       [25, 50, 51, 52, 53, 54, 55, 19,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1]], dtype=int32)

# Siamese Network

In [27]:
X_train_q1,X_valid_q1, X_train_q2, X_valid_q2 = train_test_split(Q1_train, Q2_train, test_size=0.20, random_state=12)
print(X_train_q1.shape, X_train_q2.shape)
print(X_valid_q1.shape, X_valid_q2.shape)

(119410, 20) (119410, 20)
(29853, 20) (29853, 20)


In [28]:
X_train_q1[1]

array([  25,   82,   73,  206, 1308,  736,   19,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32)

In [41]:
import random as rnd
def data_generator(Q1, Q2, batch_size, shuffle=True):
    """Generator function that yields batches of data

    Args:
        Q1 (list): List of transformed (to tensor) questions.
        Q2 (list): List of transformed (to tensor) questions.
        batch_size (int): Number of elements per batch.
        pad (int, optional): Pad character from the vocab. Defaults to 1.
        shuffle (bool, optional): If the batches should be randomnized or not. Defaults to True.
    Yields:
        tuple: Of the form (input1, input2) with types (numpy.ndarray, numpy.ndarray)
        NOTE: input1: inputs to your model [q1a, q2a, q3a, ...] i.e. (q1a,q1b) are duplicates
              input2: targets to your model [q1b, q2b,q3b, ...] i.e. (q1a,q2i) i!=a are not duplicates
    """

    input1 = []
    input2 = []
    idx = 0
    len_q = len(Q1)
    question_indexes = [*range(len_q)]
    
    if shuffle:
        rnd.shuffle(question_indexes)
    
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    while True:
        if idx >= len_q:
            # if idx is greater than or equal to len_q, set idx accordingly 
            # (Hint: look at the instructions above)
            idx = len_q
            # shuffle to get random batches if shuffle is set to True
            if shuffle:
                rnd.shuffle(question_indexes)
        
        # get questions at the `question_indexes[idx]` position in Q1 and Q2
        q1 = Q1[question_indexes[idx]]
        q2 = Q2[question_indexes[idx]]
        
        # increment idx by 1
        idx += 1
        # append q1
        input1.append(q1)
        # append q2
        input2.append(q2)
        if len(input1) == batch_size:
            # determine max_len as the longest question in input1 & input 2
            # Hint: use the `max` function. 
            # take max of input1 & input2 and then max out of the two of them.
            #max_len = max(max([len(q) for q in input1]),max([len(q) for q in input2]))
            # pad to power-of-2 (Hint: look at the instructions above)
            #max_len = 2**int(np.ceil(np.log2(max_len)))
            b1 = []
            b2 = []
            for q1, q2 in zip(input1, input2):
                # add [pad] to q1 until it reaches max_len
                q1 = q1 
                # add [pad] to q2 until it reaches max_len
                q2 = q2 
                # append q1
                b1.append(q1)
                # append q2
                b2.append(q2)
            # use b1 and b2
            yield [np.array(b1), np.array(b2)]
    ### END CODE HERE ###
            # reset the batches
            input1, input2 = [], []  # reset the batches 

In [42]:
batch_size = 2
res1, res2 = next(data_generator(X_train_q1, X_train_q2, batch_size))
print("First questions  : ",'\n', res1, '\n')
print("Second questions : ",'\n', res2)

First questions  :  
 [[  27   14    6  386  825   51  397 3832   19    1    1    1    1    1
     1    1    1    1    1    1]
 [ 269  150   73  206 1381   60    6 1545 2550   24  293   19   81   19
     1    1    1    1    1    1]] 

Second questions :  
 [[  27   14    6  386  825   48   34 3832   51  397   19    1    1    1
     1    1    1    1    1    1]
 [ 269  150   73  206 3070   34   33 1545 2550   24  293   19    1    1
     1    1    1    1    1    1]]


In [43]:
#for i in range(len(X_train_q1)):
    #X_train_q1[i] = tf.convert_to_tensor(X_train_q1[i], dtype='float32')
    #X_train_q2[i] = tf.convert_to_tensor(X_train_q2[i], dtype='float32')


In [44]:
max_len = 20
def base_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=len(vocab), output_dim=128, input_length=max_len))

    # Add LSTM
    model.add(LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(LSTM(128))
    # Add timeDistributed Layer
    model.add(Dense(128, activation="relu"))
    return model              

In [45]:
#plot_model(base_model, show_shapes=True, show_layer_names=True)

In [46]:
def TripletLossFn(x):
    
    
    v1, v2 = x

    #v1 = tf.cast(v1, dtype='float32')
    #v2 = tf.cast(v2, dtype='float32')
    margin=0.25
    
    scores = tf.matmul(v1,tf.transpose(v2))
    #scores = tf.cast(scores, dtype='float32') 
    #print('1', scores)

    batch_size = 32
    #print('2', batch_size)
    
    positive = tf.linalg.diag_part(scores)
    #positive = tf.cast(positive, dtype='float32')
    #print('3', positive)  

    negative_without_positive = scores - 2.0 * tf.experimental.numpy.eye(batch_size, dtype='float32')
    #negative_without_positive = tf.cast(negative_without_positive, dtype='float32')
    #print('4', negative_without_positive)
     
    closest_negative = tf.math.reduce_max(negative_without_positive, axis=1)
    #closest_negative = tf.cast(closest_negative, dtype='float32')
    #print('5', closest_negative)
    
    negative_zero_on_duplicate = scores * (1.0 - tf.experimental.numpy.eye(batch_size, dtype='float32'))
    #negative_zero_on_duplicate = tf.cast(negative_zero_on_duplicate, dtype='float32')
    #print('6', negative_zero_on_duplicate)
    
    mean_negative = tf.math.reduce_sum(negative_zero_on_duplicate, axis=1) / (batch_size-1)
    #mean_negative = tf.cast(mean_negative, dtype='float32')
    #print('7', mean_negative)

    triplet_loss1 = tf.math.maximum(0.0, margin - positive + closest_negative)
    #print('8', triplet_loss1)
 
    triplet_loss2 = tf.math.maximum(0.0, margin - positive + mean_negative)
    #print('9', triplet_loss2)

    triplet_loss = tf.math.reduce_mean(triplet_loss1 + triplet_loss2)
    #triplet_loss = tf.cast(triplet_loss, dtype='float32')
    #print('10', triplet_loss)
    
 
    
    return triplet_loss

In [47]:
def complete_model(base_model):
    
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
   
    v1 = base_model(input_1)
    v2 = base_model(input_2)
   
    output = Lambda(TripletLossFn, output_shape=(1,))([v1, v2])

    model = Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss=TripletLossFn, optimizer=Adam())
    return model



In [48]:
base_model = base_model()
model = complete_model(base_model)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 128)          5283200     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               ()                   0           sequential_1[0][0]         

In [49]:
#plot_model(model, show_shapes=True, show_layer_names=True)

In [50]:
batch_size = 32
train_generator = data_generator(X_train_q1, X_train_q2, batch_size)
val_generator = data_generator(X_valid_q1, X_valid_q2, batch_size)

In [51]:
history = model.fit_generator(train_generator, 
                    validation_data=val_generator, 
                    epochs=20)


Epoch 1/20




ValueError: ignored