## 1. Import Libraries & Data

In [None]:
#!pip install sentence_transformers

In [None]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
import re
from nltk import sent_tokenize
nltk.download('punkt')

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import keras
from keras.layers import Input, LSTM, Dense, Embedding, Dropout
from keras.models import Model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be read (requires Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/semantic\ search

/content/drive/MyDrive/semantic search


In [None]:
# Load the example conversations from the Excel workbook and preview the first rows.
df = pd.read_excel('data.xlsx')
df.head()

Unnamed: 0,Sentences
0,We are seeing an increasing number of errors w...
1,We need to make improvements to our landing pa...
2,It looks like the issue is limited only to vis...
3,We need to schedule a product meeting to discu...


**NB:** The following dataset (Quora Question Pairs) is used only for the second approach.

In [None]:
# Load the Quora Question Pairs dataset (question1, question2, is_duplicate, ...) and preview the first rows.
quora = pd.read_csv('train.csv')
quora.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## 2. Text Pre-Processing

In [None]:
def process(paragraph) :
  """Clean, lowercase, stopword-filter and stem every text in `paragraph`.

  Args:
    paragraph: a pandas Series (or any object exposing `.values`) or a plain
      iterable of texts. Each item is converted with `str()` before cleaning.

  Returns:
    A list with one cleaned string per input item, in input order. Tokens are
    Porter-stemmed and joined with single spaces.
  """
  ps = PorterStemmer()
  # Build the stopword set once: evaluating stopwords.words('english') for every
  # token re-creates the list each time and makes each lookup a linear scan.
  stop_words = set(stopwords.words('english'))
  # pandas objects expose their raw data through `.values`; anything else is
  # iterated as it is (replaces the former bare `except`).
  par = getattr(paragraph, 'values', paragraph)

  corpus = []
  for text in par :
    # keep only ASCII letters; every other character becomes a space
    cleaned = re.sub('[^a-zA-Z]', ' ', str(text))

    # lowercase and split into tokens
    tokens = cleaned.lower().split()

    # remove stopwords + stemming
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))

  return corpus

## 3. First Approach = DistilBERT + Cosine Similarity

In [None]:
# Load the pretrained sentence encoder used below to embed both the conversations and the search query.
modelB = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

We embed our conversations :

In [None]:
# Take the first column (the conversation texts), clean it with process(), then embed each cleaned text.
paragraph = df.iloc[:, 0] 
corpus = process(paragraph)
embeddings_distilbert = modelB.encode(corpus)

100%|██████████| 4/4 [00:00<00:00, 96.18it/s]


We embed our search query :

In [None]:
search_string = "failed payments"
# NOTE(review): the query is encoded as-is, whereas the conversations go through process() first — confirm this asymmetry is intended.
search_vect = modelB.encode([search_string])

Let's now compute the similarity between the query vector and the conversation vectors using cosine similarity:

In [None]:
def get_similar(query, conversations, k=1):
    """Return the positions of the `k` conversations most similar to the query.

    Args:
        query: 2-D array-like of query embeddings; only the first row is used.
        conversations: 2-D array-like with one embedding per conversation.
        k: number of positions to return. `1` (default) returns a one-element
            list; `None` returns every position ranked by similarity.

    Returns:
        Positions into `conversations`, ordered from most to least similar.

    Raises:
        ValueError: if `k` is negative.
    """
    # cosine_similarity yields one row per query vector; keep the first query only
    similarities = cosine_similarity(query, conversations)[0]

    if k == 1:
        return [np.argmax(similarities)]

    # full ranking, best match first
    ranked = np.argsort(similarities)[::-1]
    if k is None:
        return ranked
    if k < 0:
        raise ValueError("k must be a non-negative integer or None")
    # slicing from the front also handles k == 0 (empty result); the former
    # `[-k:]` slice returned every position in that case
    return ranked[:k]

In [None]:
K = 2 # number of conversations to return; we ask for the 2 best matches here
distilbert_similar_indexes = get_similar(search_vect, embeddings_distilbert, K)

[ 0.13662694 -0.04744482  0.04145075 -0.05135897]


In [None]:
# get_similar returns positional indexes, so select by position (.iloc) rather
# than by label, which only coincides with position for a default RangeIndex.
output_data = [paragraph.iloc[index] for index in distilbert_similar_indexes]

In [None]:
# Display the conversations retrieved for the query.
output_data

['We are seeing an increasing number of errors with our payment services. The\nissue has been reported by multiple users in the last 3 hours and this is affecting\nour revenue. We need to fix it immediately',
 'It looks like the issue is limited only to visa credit cards']

### Highlight the results

Highlight each relevant conversation among all the conversations:

In [None]:
# Inspect the raw conversation texts as an array.
paragraph.values

array(['We are seeing an increasing number of errors with our payment services. The\nissue has been reported by multiple users in the last 3 hours and this is affecting\nour revenue. We need to fix it immediately',
       'We need to make improvements to our landing page to convey our new\nbranding guidelines.',
       'It looks like the issue is limited only to visa credit cards',
       'We need to schedule a product meeting to discuss the new set of features and\nthe roadmap.'],
      dtype=object)

In [None]:
from termcolor import colored

# Print every conversation; the retrieved ones are painted white on green.
for conversation in paragraph.values:
    is_hit = conversation in output_data
    print(colored(conversation, 'white', 'on_green') if is_hit else conversation)


[42m[37mWe are seeing an increasing number of errors with our payment services. The
issue has been reported by multiple users in the last 3 hours and this is affecting
our revenue. We need to fix it immediately[0m
We need to make improvements to our landing page to convey our new
branding guidelines.
[42m[37mIt looks like the issue is limited only to visa credit cards[0m
We need to schedule a product meeting to discuss the new set of features and
the roadmap.


Highlight just the relevant sentence in search results :

In [None]:
# For each retrieved conversation, highlight the single sentence closest to the query.
for conversation in output_data:
    sentences = sent_tokenize(conversation)
    if not sentences:
        # nothing to embed or rank for an empty conversation
        continue
    sentence_vectors = modelB.encode(process(sentences))
    # Position of the best-matching sentence. Comparing positions (not text)
    # avoids highlighting every copy of a repeated sentence.
    best_position = get_similar(search_vect, sentence_vectors, 1)[0]
    for position, sentence in enumerate(sentences):
        if position == best_position:
            print(colored(sentence, 'white', 'on_green'), end=' ')
        else:
            print(sentence, end=' ')
    print('\n')

[42m[37mWe are seeing an increasing number of errors with our payment services.[0m The
issue has been reported by multiple users in the last 3 hours and this is affecting
our revenue. We need to fix it immediately 

[42m[37mIt looks like the issue is limited only to visa credit cards[0m 



## 4. Second Approach = DistilBERT + Neural Network 

First, we compute DistilBERT embeddings for every question in the Quora dataset:

In [None]:
# Columns 3 and 4 of the Quora frame hold question1 and question2.
paragraph1 = quora.iloc[:, 3]
paragraph2 = quora.iloc[:, 4]
corpus_1 = process(paragraph1)
corpus_2 = process(paragraph2)
# Encode with the sentence encoder (modelB): `model` is not defined yet at this
# point and is later bound to the Keras classifier, which has no encode().
embeddings_sen1 = modelB.encode(corpus_1)
embeddings_sen2 = modelB.encode(corpus_2)

We save the embeddings so that we don't need to recompute them on every run:

In [None]:
import pickle

encoding_data_file_quest1='encoding_quest1'
encoding_data_file_quest2='encoding_quest2'

# Persist each embedding matrix to its own file. The previous version dumped an
# undefined `vec1` twice into the first file and never wrote the second one,
# so the reload of encoding_quest2 could not succeed.
with open(encoding_data_file_quest1, "wb") as fp:
    pickle.dump(embeddings_sen1, fp)

with open(encoding_data_file_quest2, "wb") as fp:
    pickle.dump(embeddings_sen2, fp)

In [None]:
# Reload the cached embeddings from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(encoding_data_file_quest1, "rb") as quest1_file:
    embeddings_sen1 = pickle.load(quest1_file)

with open(encoding_data_file_quest2, "rb") as quest2_file:
    embeddings_sen2 = pickle.load(quest2_file)

In [None]:
# Cast both embedding sets and the is_duplicate labels (column 5) to float32 arrays for training.
train_vec1 = np.asarray(embeddings_sen1, dtype=np.float32)
train_vec2 = np.asarray(embeddings_sen2, dtype=np.float32)
train_label = np.asarray(quora.iloc[:, 5], dtype=np.float32)

Let's now create our neural-network model:

In [None]:
# Two inputs, one 768-dimensional sentence embedding each.
input1 = Input(shape=(768,))
input2 = Input(shape=(768,))

# Concatenate the two inputs (the embeddings of the two sentences of a pair) into one 1536-dimensional vector.
x = keras.layers.concatenate([input1,input2], axis=-1)

# Three dense layers, with a dropout layer after each of the first two to limit overfitting.
# The final Dense layer uses a sigmoid, so the predicted score lies between 0 and 1.
x = Dense(1024,activation='relu') (x)
x = Dropout(0.5) (x)
x = Dense(256,activation='relu') (x)
x = Dropout(0.5) (x)
x = Dense(64,activation='relu') (x)

output = Dense(1,activation='sigmoid') (x)

model = Model(inputs=[input1,input2],outputs=output)
model.summary()

# Compile for binary classification, tracking accuracy.
model.compile(optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['acc'])

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 concatenate_2 (Concatenate)    (None, 1536)         0           ['input_5[0][0]',                
                                                                  'input_6[0][0]']                
                                                                                                  
 dense_5 (Dense)                (None, 1024)         1573888     ['concatenate_2[0][0]']    

Training :

In [None]:
# Train on the paired question embeddings for 35 epochs, holding out 20% of the data for validation.
history=model.fit([train_vec1, train_vec2], train_label, 
	epochs=35,batch_size=200,
	validation_split=0.2) # an early-stopping callback could be added here

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


**Inference :**
We apply the model to the example given in the assignment:

In [None]:
# Score every conversation embedding against the query with the trained network.
n = len(embeddings_distilbert)
new = np.asarray(embeddings_distilbert, np.float32)

# Repeat the query embedding once per conversation so both inputs have n rows.
queries = np.asarray(np.tile(search_vect, (n, 1)), np.float32)
preds = model.predict([new, queries], batch_size=200)
preds

array([[6.1617627e-08],
       [6.6597136e-07],
       [6.6062169e-09],
       [4.7373567e-07]], dtype=float32)

We sort the results and keep the two most relevant conversations:

In [None]:
# The network is trained on is_duplicate labels with a sigmoid output, so a
# higher score means "more similar": rank in descending order before keeping
# the top K (an ascending argsort followed by [:K] returns the K lowest scores).
top_positions = np.argsort(preds[:, 0])[::-1][:K]
# positions, not labels, so index with .iloc
output_data_nn = [paragraph.iloc[index] for index in top_positions]

output_data_nn

['It looks like the issue is limited only to visa credit cards',
 'We are seeing an increasing number of errors with our payment services. The\nissue has been reported by multiple users in the last 3 hours and this is affecting\nour revenue. We need to fix it immediately']

We highlight the results :

In [None]:
# Print every conversation; those selected by the network are painted white on green.
for conversation in paragraph.values:
    is_hit = conversation in output_data_nn
    print(colored(conversation, 'white', 'on_green') if is_hit else conversation)

[42m[37mWe are seeing an increasing number of errors with our payment services. The
issue has been reported by multiple users in the last 3 hours and this is affecting
our revenue. We need to fix it immediately[0m
We need to make improvements to our landing page to convey our new
branding guidelines.
[42m[37mIt looks like the issue is limited only to visa credit cards[0m
We need to schedule a product meeting to discuss the new set of features and
the roadmap.


We highlight the relevant sentence in search results :

In [None]:
# For each conversation selected by the network, highlight its highest-scoring sentence.
for conversation in output_data_nn:
    sentences = sent_tokenize(conversation)
    if not sentences:
        # nothing to embed or score for an empty conversation
        continue
    # Embed with the sentence encoder (modelB); `model` is the Keras classifier
    # and has no encode() method.
    sentence_vectors = modelB.encode(process(sentences))

    new = np.asarray(sentence_vectors, np.float32)
    n = len(sentence_vectors)
    # repeat the query embedding once per sentence so both inputs have n rows
    queries = np.asarray(list(search_vect)*n, np.float32)
    preds = model.predict([new, queries], batch_size=200)
    best_position = np.argmax(preds[:,0])

    # compare positions (not text) so a repeated sentence is highlighted only once
    for position, sentence in enumerate(sentences):
        if position == best_position:
            print(colored(sentence, 'white', 'on_green'), end=' ')
        else:
            print(sentence, end=' ')
    print('\n')

[42m[37mIt looks like the issue is limited only to visa credit cards[0m 

[42m[37mWe are seeing an increasing number of errors with our payment services.[0m The
issue has been reported by multiple users in the last 3 hours and this is affecting
our revenue. We need to fix it immediately 

