In [1]:
pip install trax==1.3.1

Collecting trax==1.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/fe/d8/ad90a5c79804561bbbc5fd65a4cb6b6e735370225e777cfc46980a9dc479/trax-1.3.1-py2.py3-none-any.whl (347kB)
[K     |█                               | 10kB 23.0MB/s eta 0:00:01[K     |█▉                              | 20kB 4.4MB/s eta 0:00:01[K     |██▉                             | 30kB 5.2MB/s eta 0:00:01[K     |███▊                            | 40kB 6.0MB/s eta 0:00:01[K     |████▊                           | 51kB 4.9MB/s eta 0:00:01[K     |█████▋                          | 61kB 5.7MB/s eta 0:00:01[K     |██████▋                         | 71kB 6.2MB/s eta 0:00:01[K     |███████▌                        | 81kB 6.5MB/s eta 0:00:01[K     |████████▌                       | 92kB 7.0MB/s eta 0:00:01[K     |█████████▍                      | 102kB 6.6MB/s eta 0:00:01[K     |██████████▍                     | 112kB 6.6MB/s eta 0:00:01[K     |███████████▎                    | 122kB 6.6MB/s e

# **1. Import modules**

In [2]:
import os
import trax
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import random as rnd
from collections import defaultdict
from google.colab import files
# Single seed shared by Trax's random number generators and Python's `random`
# module (used for shuffling in the data generator).
SEED = 34
trax.supervised.trainer_lib.init_random_number_generators(SEED)
rnd.seed(SEED)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
uploaded = files.upload()

Saving questions.csv to questions (1).csv


# **2. Loading and splitting the Dataset**

In [7]:
data = pd.read_csv('questions (1).csv')
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
print('Number of question pairs in the dataset:', len(data))

Number of question pairs in the dataset: 404351


In [9]:
# Contiguous, fixed-size split: the first `train_size` rows are the training pool,
# the following `test_size` rows are the held-out test set.
train_size, test_size = 300000, 10 * 1024
test_end = train_size + test_size
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:test_end]
print('Training data size:', len(train_data), "\n")
print('Test data size:', len(test_data))

Training data size: 300000 

Test data size: 10240


In [10]:
# Boolean masks over the training rows and the positions where each mask is True.
duplicates = train_data['is_duplicate'].eq(1).to_numpy()
duplicates_index = np.flatnonzero(duplicates).tolist()
print('Total number of duplicate questions in the dataset:', len(duplicates_index))
print('Index of first ten duplicate questions:', duplicates_index[:10], '\n')

non_duplicates = train_data['is_duplicate'].eq(0).to_numpy()
non_duplicates_index = np.flatnonzero(non_duplicates).tolist()
print('Total number of non duplicate questions in the dataset:', len(non_duplicates_index))
print('Index of first ten non duplicate questions:', non_duplicates_index[:10], '\n')

# One example per heading: (heading, source frame, row label to show).
# The test frame keeps the original row labels, so its first row is labelled 300000.
samples = (
  ('Duplicate question sample from the TRAINING dataset', train_data, 5),
  ('Non duplicate question sample from the TRAINING dataset', train_data, 0),
  ('Sample question pairs from the TEST dataset', test_data, 300000),
)
fields = (
  ('Question 1:', 'question1'),
  ('Question 2:', 'question2'),
  ('is_duplicate:', 'is_duplicate'),
)
for heading, frame, row_label in samples:
  print(heading)
  print('-'*100)
  for caption, column in fields:
    print(caption, frame[column][row_label], '\n')


Total number of duplicate questions in the dataset: 111486
Index of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29] 

Total number of non duplicate questions in the dataset: 188514
Index of first ten non duplicate questions: [0, 1, 2, 3, 4, 6, 8, 9, 10, 14] 

Duplicate question sample from the TRAINING dataset
----------------------------------------------------------------------------------------------------
Question 1: Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

Question 2: I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

is_duplicate: 1 

Non duplicate question sample from the TRAINING dataset
----------------------------------------------------------------------------------------------------
Question 1: What is the step by step guide to invest in share market in india? 

Question 2: What is the step by step guide to invest in share market? 

is_duplicate: 0 

Sample questi

In [11]:
# Training text: only the rows flagged as duplicates are kept.
Q1_train_words, Q2_train_words = (
  np.array(train_data.loc[duplicates_index, column])
  for column in ('question1', 'question2')
)

# Test text and labels: every row of the test split is kept.
Q1_test_words, Q2_test_words, y_test = (
  np.array(test_data[column])
  for column in ('question1', 'question2', 'is_duplicate')
)

In [None]:
Q1_train_words

array(['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
       'How can I be a good geologist?',
       'How do I read and find my YouTube comments?', ...,
       'What are the top 10 TV series one should genuinely watch?',
       'Is there no life on other planets?',
       'How do I tell the difference between infatuation and love?'],
      dtype=object)

In [None]:
Q2_train_words

array(["I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",
       'What should I do to be a great geologist?',
       'How can I see all my Youtube comments?', ...,
       'Which TV series should are worth watching?',
       'Is there life on other planets?',
       'What is the difference between love and infatuation?'],
      dtype=object)

In [12]:
# Allocate uninitialised arrays with the same shape and dtype as the raw-text
# arrays; they are filled with token lists further down.
Q1_train, Q2_train = (np.empty_like(words) for words in (Q1_train_words, Q2_train_words))

Q1_test, Q2_test = (np.empty_like(words) for words in (Q1_test_words, Q2_test_words))


In [13]:
# Build the vocabulary from the training questions only.
# Id 0 is what the defaultdict returns for unknown words and id 1 is '<PAD>',
# so real words receive ids starting at 2, in order of first appearance.
vocab = defaultdict(lambda:0) #if word not in vocab then return 0
vocab['<PAD>'] = 1

# Tokenize each training pair and register every new token.
# (`row` is used instead of `id` so the builtin `id()` is not shadowed.)
for row, (text1, text2) in enumerate(zip(Q1_train_words, Q2_train_words)):
  Q1_train[row] = nltk.word_tokenize(text1)
  Q2_train[row] = nltk.word_tokenize(text2)
  for word in Q1_train[row] + Q2_train[row]:
    if word not in vocab:
      vocab[word] = len(vocab) + 1

print('Number of words or length of vocabulary:', len(vocab), '\n')

print('sample index of words in vocabulary:', '\n')
print(vocab['<PAD>'])
print(vocab['cse'])
print(vocab['market'])

Number of words or length of vocabulary: 36342 

sample index of words in vocabulary: 

1
11522
535


In [14]:
print('Original question before tokenizing in training set:', Q1_train_words[1], '\n')
print('After tokenizing:',Q1_train[1])

Original question before tokenizing in training set: How can I be a good geologist? 

After tokenizing: ['How', 'can', 'I', 'be', 'a', 'good', 'geologist', '?']


In [15]:
# Tokenize the test questions. The vocabulary is not touched here.
# (`row` is used instead of `id` so the builtin `id()` is not shadowed.)
for row, (text1, text2) in enumerate(zip(Q1_test_words, Q2_test_words)):
  Q1_test[row] = nltk.word_tokenize(text1)
  Q2_test[row] = nltk.word_tokenize(text2)

print('Original question before tokenizing in test set:', Q1_test_words[1], '\n')
print('After tokenizing:',Q1_test[1])

Original question before tokenizing in test set: What is the best bicycle to buy under 10k? 

After tokenizing: ['What', 'is', 'the', 'best', 'bicycle', 'to', 'buy', 'under', '10k', '?']


# **Convert question to tensor**

In [16]:
# Replace every token list with its list of vocabulary ids, in place.
# NOTE: `vocab` is a defaultdict, so looking up a word that was never seen in
# training returns 0 AND inserts that word into `vocab`, which makes
# len(vocab) grow. This cell is also not idempotent: running it twice would
# look up the integer ids themselves.
for first_questions, second_questions in ((Q1_train, Q2_train), (Q1_test, Q2_test)):
  for i, (tokens1, tokens2) in enumerate(zip(first_questions, second_questions)):
    first_questions[i] = [vocab[word] for word in tokens1]
    second_questions[i] = [vocab[word] for word in tokens2]

In [17]:
print('Question in train set:')
print(Q1_train_words[0])
print('Encoded version:', Q1_train[0], '\n')

print('Question in test set:', Q1_test_words[0])
print('Encoded version:', Q1_test[0])


Question in train set:
Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Encoded version: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

Question in test set: How do I prepare for interviews for cse?
Encoded version: [32, 38, 4, 107, 65, 1015, 65, 11522, 21]


In [18]:
# Hold out the trailing 20% of the encoded training pairs for validation.
# The split is positional; nothing is shuffled here.
cut_off = int(len(Q1_train) * 0.8)
(train_Q1, val_Q1), (train_Q2, val_Q2) = (
  (questions[:cut_off], questions[cut_off:]) for questions in (Q1_train, Q2_train)
)

print('Length of training set:', len(train_Q1))
print('Length of validation set:', len(val_Q1))

Length of training set: 89188
Length of validation set: 22298


# **Implementing Data Generator**

In [19]:
def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
  """Yield batches of padded question pairs forever.

  Pairs are visited in index order (or in a random order when `shuffle` is
  True). When the end of the data is reached the generator wraps around to
  the start, re-shuffling if requested, so a batch may mix the end of one pass
  with the start of the next.

  Args:
    Q1: indexable collection of questions, each a sequence of token ids.
    Q2: collection parallel to Q1; Q2[i] is paired with Q1[i].
    batch_size: number of pairs in every yielded batch.
    pad: token id appended to questions shorter than the batch's padded length.
    shuffle: whether to randomise the visiting order (uses the `random` module).

  Yields:
    A tuple (b1, b2) of numpy arrays, each of shape (batch_size, max_len),
    where max_len is the longest question in the batch rounded up to a power
    of two (and at least 1).
  """
  len_q = len(Q1)
  question_indexes = [*range(len_q)]

  if shuffle:
    rnd.shuffle(question_indexes)

  idx = 0
  input1, input2 = [], []

  while True:
    if idx >= len_q:
      # Finished a pass over the data: start over, with a fresh order if asked.
      idx = 0
      if shuffle:
        rnd.shuffle(question_indexes)

    position = question_indexes[idx]
    idx += 1
    # list(...) lets questions be any sequence type (list, tuple, array) while
    # keeping the `+ [pad] * k` padding below a plain list concatenation.
    input1.append(list(Q1[position]))
    input2.append(list(Q2[position]))

    if len(input1) == batch_size:
      longest = max(max(len(q) for q in input1), max(len(q) for q in input2))
      # Round up to the next power of two using integer arithmetic (presumably
      # to limit how many distinct batch shapes the model sees). Clamping to 1
      # keeps a batch made only of empty questions from failing on log2(0).
      max_len = 1 << (max(longest, 1) - 1).bit_length()

      b1 = [q + [pad] * (max_len - len(q)) for q in input1]
      b2 = [q + [pad] * (max_len - len(q)) for q in input2]
      yield np.array(b1), np.array(b2)

      input1, input2 = [], [] #reset the batches

In [20]:
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))
print('Two questions from train_Q1 are:', res1, '\n')
print('Two questions from train_Q2 are:', res2, '\n')

Two questions from train_Q1 are: [[  30   87   78  134 2131 1980   28   78  594   21    1    1    1    1
     1    1]
 [  30   55   78 3540 1460   28   56  253   21    1    1    1    1    1
     1    1]] 

Two questions from train_Q2 are: [[  30  156   78  134 2131 9516   21    1    1    1    1    1    1    1
     1    1]
 [  30  156   78 3540 1460  131   56  253   21    1    1    1    1    1
     1    1]] 



# **Constructing Siamese Model**

In [74]:
def siamese(vocab_size = len(vocab), d_model = 128, mode = 'train'):
  """Build the two-branch question encoder.

  Each branch is Embedding -> LSTM -> mean over axis 1 -> L2 normalisation.
  The very same `Serial` object is handed to `tl.Parallel` twice, once per
  input question.

  Args:
    vocab_size: size of the embedding table. The default is `len(vocab)`
      evaluated once, when this `def` statement is executed.
    d_model: embedding width and number of LSTM units.
    mode: accepted but not used by this function.

  Returns:
    A `tl.Parallel` layer taking two inputs and producing two outputs.
  """

  def normalize(x):
    # Scale vectors along the last axis to unit L2 norm.
    squared_norm = fastnp.sum(x * x, axis=-1, keepdims=True)
    return x / fastnp.sqrt(squared_norm)

  encoder_layers = [
      tl.Embedding(vocab_size = vocab_size, d_feature = d_model),
      tl.LSTM(n_units=d_model),
      tl.Mean(axis=1),
      tl.Fn('Normalize', normalize),
  ]
  q_processor = tl.Serial(*encoder_layers)

  # Feed Q1 and Q2 through the same processor side by side.
  return tl.Parallel(q_processor, q_processor)

In [75]:
model = siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
]


# **Calculating Triplet Loss**

In [76]:
def TripletLossFn(v1, v2, margin = 0.25):
  """Triplet loss with hard-negative and mean-negative terms.

  Row i of `v1` and row i of `v2` form the positive pair; every other row of
  `v2` acts as a negative for row i of `v1`.

  Args:
    v1: array of shape (batch_size, model_dimension) associated with Q1.
    v2: array of shape (batch_size, model_dimension) associated with Q2.
    margin: required gap between the positive and the negative similarities.

  Returns:
    Scalar: the batch mean of
    max(0, margin - positive + closest_negative) +
    max(0, margin - positive + mean_negative).
  """
  # Pairwise dot products; these are cosine similarities when the rows of v1
  # and v2 are L2-normalised.
  scores = fastnp.dot(v1, v2.T)

  batch_size = len(scores)
  identity = fastnp.eye(batch_size)

  # Diagonal entries are the similarities of the true (duplicate) pairs.
  positive = fastnp.diagonal(scores)

  # Subtract 2 on the diagonal so the positive can never win the row-wise max
  # (assumes similarities lie in [-1, 1]).
  closest_negative = (scores - 2.0 * identity).max(axis=1)

  # Zero the diagonal, then average the remaining entries of each row.
  # max(..., 1) avoids dividing by zero for a batch holding a single pair, in
  # which case there are no negatives and the mean is taken to be 0.
  n_negatives = max(batch_size - 1, 1)
  mean_negative = fastnp.sum(scores * (1.0 - identity), axis=1) / n_negatives

  triplet_loss_1 = fastnp.maximum(0.0, margin - positive + closest_negative)
  triplet_loss_2 = fastnp.maximum(0.0, margin - positive + mean_negative)

  return fastnp.mean(triplet_loss_1 + triplet_loss_2)

In [77]:
from functools import partial
def TripletLoss(margin = 0.25):
  """Return a Trax layer named 'TripletLoss' that calls TripletLossFn with `margin` pre-bound."""
  return tl.Fn('TripletLoss', partial(TripletLossFn, margin = margin))

# **Generate training and validation data for training**

In [78]:
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
print('train_Q1.shape', train_Q1.shape, '\n')
print('val_Q1.shape', val_Q1.shape)

train_Q1.shape (89188,) 

val_Q1.shape (22298,)


# **Training the siamese model**

In [79]:
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

def train_model(siamese, TripletLoss, lr_schedule, train_generator = train_generator, val_generator = val_generator, output_dir='model/'):
  """Assemble a Trax training loop for the siamese model.

  Args:
    siamese: zero-argument callable that builds the model.
    TripletLoss: zero-argument callable that builds the loss layer; it is
      called twice, once for training and once for evaluation.
    lr_schedule: learning-rate schedule passed to the training task.
    train_generator: generator of training batches. The default is the
      notebook-level `train_generator` that existed when this function was defined.
    val_generator: generator of validation batches (same default rule).
    output_dir: directory handed to the loop; `~` is expanded.

  Returns:
    The constructed `training.Loop`. It has not been run yet.
  """
  train_task = training.TrainTask(
      labeled_data=train_generator,
      loss_layer=TripletLoss(),
      optimizer=trax.optimizers.Adam(0.01),
      lr_schedule=lr_schedule,
  )

  eval_task = training.EvalTask(
      labeled_data=val_generator,
      metrics=[TripletLoss()],
  )

  return training.Loop(
      siamese(),
      train_task,
      eval_task=eval_task,
      output_dir=os.path.expanduser(output_dir),
  )

In [80]:
training_steps = 1
training_loop = train_model(siamese, TripletLoss, lr_schedule)
training_loop.run(training_steps)

Step      1: train TripletLoss |  0.49796584
Step      1: eval  TripletLoss |  0.49929124


In [38]:
uploaded1 = files.upload()

Saving model.pkl.gz to model.pkl.gz


In [81]:
# Loading in the saved model
model = siamese()
model.init_from_file('model.pkl.gz')

# **Testing accuracy of your model**

In [82]:
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator = data_generator, batch_size = 64):
  """Compute duplicate-detection accuracy on encoded question pairs.

  Args:
    test_Q1: sliceable collection of encoded first questions.
    test_Q2: sliceable collection of encoded second questions, parallel to test_Q1.
    y: labels parallel to the questions (1 for duplicate, 0 otherwise).
    threshold: a pair is predicted duplicate when the dot product of its two
      encodings is strictly greater than this value.
    model: callable taking a (q1_batch, q2_batch) tuple and returning (v1, v2).
    vocab: mapping that provides the '<PAD>' token id.
    data_generator: batch generator factory used to pad each slice.
    batch_size: number of pairs sent through the model per call.

  Returns:
    Fraction of pairs whose prediction equals the label, as a float.
  """
  correct = 0
  for i in range(0, len(test_Q1), batch_size):
    y_batch = np.asarray(y[i:i+batch_size])
    # The last slice may hold fewer than batch_size pairs. The notebook's
    # data_generator then wraps around and repeats questions to fill the batch,
    # so only the first n_real rows correspond to labels.
    n_real = len(y_batch)

    q1, q2 = next(data_generator(test_Q1[i:i+batch_size], test_Q2[i:i+batch_size], batch_size, vocab['<PAD>'], shuffle = False))

    #call the model
    v1, v2 = model((q1, q2))

    # Row-wise dot product of the two encodings for the real rows only.
    similarity = np.sum(np.asarray(v1)[:n_real] * np.asarray(v2)[:n_real], axis=1)
    correct += int(np.sum((similarity > threshold) == y_batch))

  return correct / len(test_Q1)

In [83]:
accuracy = classify(Q1_test, Q2_test, y_test, 0.7, model, vocab, batch_size = 512)
print('Accuracy:', accuracy)

Accuracy: 0.71708984375


# **Testing with your own questions**

In [84]:
def predict(question1, question2, threshold, model, vocab, data_generator = data_generator, verbose = False):
  """Decide whether two raw question strings are duplicates.

  Both questions are tokenized with nltk, mapped to ids through `vocab`,
  padded into a one-pair batch by `data_generator` and encoded by `model`.
  The dot product of the two encodings is then compared with `threshold`.

  Args:
    question1: first question, as text.
    question2: second question, as text.
    threshold: the pair counts as duplicate when the dot product is strictly
      greater than this value.
    model: callable taking a (q1_batch, q2_batch) tuple and returning (v1, v2).
    vocab: mapping from token to id; must contain '<PAD>'. It is not modified.
    data_generator: batch generator factory used for padding.
    verbose: when True, print the padded inputs, the score and the decision.

  Returns:
    The boolean result of `d > threshold`.
  """
  def encode(text):
    # .get() rather than vocab[word]: with a defaultdict, indexing would insert
    # every unseen word and grow the shared vocabulary on each call. 0 is the
    # id this notebook's vocabulary uses for unknown words.
    return [vocab.get(word, 0) for word in nltk.word_tokenize(text)]

  Q1, Q2 = next(data_generator([encode(question1)], [encode(question2)], 1, vocab['<PAD>']))

  v1, v2 = model((Q1, Q2))

  d = np.dot(v1[0], v2[0].T)

  res = d > threshold

  if verbose:
    print("Q1  = ", Q1, "\nQ2  = ", Q2)
    print("d   = ", d)
    print("res = ", res)
  return res


In [86]:
question1 = "When will I see you?"
question2 = "When can I see you again?"
# predict returns True when the similarity score exceeds the threshold (pair judged duplicate), False otherwise
predict(question1 , question2, 0.5, model, vocab, verbose = True)

Q1  =  [[585  76   4  46  53  21   1   1]] 
Q2  =  [[ 585   33    4   46   53 7287   21    1]]
d   =  0.8621342
res =  True


True

In [87]:
question1 = "what is your name"
question2 = "May i know your name"
# predict returns True when the similarity score exceeds the threshold (pair judged duplicate), False otherwise
predict(question1 , question2, 0.5, model, vocab, verbose = True)

Q1  =  [[  15  156   56 1377    1    1    1    1]] 
Q2  =  [[11076   698   112    56  1377     1     1     1]]
d   =  0.5994884
res =  True


True