### 4

In [1]:
# Location of the titles CSV that is loaded into a DataFrame below.
# NOTE(review): this is an absolute, machine-specific path, and the '*' segment is
# handed to pd.read_csv as-is (presumably a redacted folder name — confirm);
# point it at the real location of titles.csv before running.
path = '/Users/olang/Desktop/*/Projects/NLP/Semantic Search/sampledata/titles.csv'

In [2]:
import torch
import torch.nn.functional as F

In [3]:
import pandas as pd

# Load the titles CSV into a DataFrame and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,Title
0,Security Guard Deployment System Using Vehicul...
1,A Breast Cancer Detection System using Machine...
2,Autism Spectrum Disorder prediction in childre...
3,Energy consumption prediction and scheduling ...
4,An Intelligent Chatbot for Finance and Banking


In [4]:
# Count missing values per column.
df.isnull().sum()

Title    0
dtype: int64

In [5]:
# Sample sentences: take the first two titles to walk through the embedding steps.

x = df.Title.iloc[0]
y = df.Title.iloc[1]

print(x, y)

Security Guard Deployment System Using Vehicular Movement Behaviour: Case of Residential Real Estates
 A Breast Cancer Detection System using Machine learning 


In [6]:
# Create an embedding for each sentence using a transformer model.
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load the tokenizer and the model from the HuggingFace Hub.
# The same checkpoint name is used for both.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
sentences = [x, y]

# Tokenize both sentences as one batch, with padding and truncation enabled;
# return_tensors="pt" asks the tokenizer for PyTorch tensors.
sample_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

In [8]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast across
            the embedding dimension and used as a weight (0 drops a token).

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the summed mask (clamped to at least 1e-9 so an all-zero
        mask does not divide by zero).
    """
    hidden_states = model_output[0]

    # Stretch the mask to the same size as the embeddings so it can weight them.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

In [9]:
# Get token embeddings.
with torch.no_grad():             # disable gradient tracking: inference only, no backpropagation/training
    sample_embeddings = model(**sample_input)           # ** unpacks the tokenizer's dictionary into keyword arguments

# Get sentence embeddings by performing mean pooling on the token embeddings.
# NOTE(review): presumably equivalent to the sentence embeddings produced by the sentence-transformers library — confirm.
sample_sent_embeddings = mean_pooling(sample_embeddings, sample_input['attention_mask'])

# Normalize embeddings, i.e. make them unit vectors by dividing by their L2 (Euclidean) norm.
sample_sent_embeddings = F.normalize(sample_sent_embeddings, p=2, dim=1)

In [10]:
# Sanity check: print the shape of the sample sentence-embedding tensor.
print("Sentence embeddings:")
print(sample_sent_embeddings.shape)
#print(sample_sent_embeddings)

Sentence embeddings:
torch.Size([2, 384])


In [11]:
# wrap all of the steps above in a single function

def get_sentence_emd(sentence_list):
    """Embed sentences as mean-pooled, L2-normalized vectors.

    Tokenizes ``sentence_list`` with the module-level ``tokenizer`` (padding
    and truncation enabled, PyTorch tensors), runs the module-level ``model``
    without gradient tracking, mean-pools the token embeddings using the
    attention mask, and scales each row to unit L2 norm.

    Args:
        sentence_list: Sentences to embed; passed unchanged to the tokenizer.

    Returns:
        The normalized sentence-embedding tensor.
    """
    batch = tokenizer(sentence_list, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

In [12]:
# Embed every title in the DataFrame with a single call.
all_embeddings = get_sentence_emd(df.Title.tolist())

In [13]:
# Shape check for the full embedding tensor.
print(all_embeddings.shape)

torch.Size([140, 384])


In [14]:
# Store each title's embedding as a plain Python list in a new column, then preview the frame.
df['Embeddings'] = all_embeddings.tolist()
df.head()

Unnamed: 0,Title,Embeddings
0,Security Guard Deployment System Using Vehicul...,"[0.0831846296787262, -0.00844046100974083, -0...."
1,A Breast Cancer Detection System using Machine...,"[-0.005373380612581968, 0.006123208440840244, ..."
2,Autism Spectrum Disorder prediction in childre...,"[0.04774220287799835, -0.08992604911327362, 0...."
3,Energy consumption prediction and scheduling ...,"[-0.04396095499396324, 0.02846721187233925, 0...."
4,An Intelligent Chatbot for Finance and Banking,"[-0.04426628351211548, 0.0009629157721064985, ..."


In [15]:
#df.to_csv('titles_embeddings.csv', index=False)

----

In [16]:
# Keep a copy of the frame (now including the Embeddings column); the query cells below search this copy.
df_bkp = df.copy()

In [17]:
# plot the embeddings in 2D space (currently disabled: the code below is wrapped in a string literal, so nothing is plotted)

""" from sklearn.manifold import TSNE       # t-distributed stochastic neighbor embedding (t-SNE) is a technique for dimensionality reduction
import matplotlib.pyplot as plt

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(all_embeddings)

df['tsne-2d-one'] = tsne_results[:,0]
df['tsne-2d-two'] = tsne_results[:,1]

plt.figure(figsize=(16,10))
plt.scatter(
    x=df["tsne-2d-one"], y=df["tsne-2d-two"],
    c='blue',
    s=10,
    alpha=0.5
)

plt.show() """


' from sklearn.manifold import TSNE       # t-distributed stochastic neighbor embedding (t-SNE) is a technique for dimensionality reduction\nimport matplotlib.pyplot as plt\n\ntsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)\ntsne_results = tsne.fit_transform(all_embeddings)\n\ndf[\'tsne-2d-one\'] = tsne_results[:,0]\ndf[\'tsne-2d-two\'] = tsne_results[:,1]\n\nplt.figure(figsize=(16,10))\nplt.scatter(\n    x=df["tsne-2d-one"], y=df["tsne-2d-two"],\n    c=\'blue\',\n    s=10,\n    alpha=0.5\n)\n\nplt.show() '

In [17]:
# search for similar sentences

from sklearn.metrics.pairwise import cosine_similarity

def get_similar_sentences(query, df, top_n=5):
    """Return the titles most similar to ``query`` together with their scores.

    The query is embedded with ``get_sentence_emd`` and compared, via cosine
    similarity, against every row of ``df['Embeddings']``.

    Args:
        query: Free-text search string.
        df: DataFrame with a ``Title`` column and an ``Embeddings`` column
            holding one embedding (list of floats) per row. It is not
            modified by this function.
        top_n: Number of best matches to return.

    Returns:
        A ``(titles, similarities)`` tuple of two lists, ordered from most to
        least similar. Both lists are empty when ``df`` has no rows.
    """
    # Nothing to rank; also avoids handing an empty matrix to cosine_similarity.
    if len(df) == 0:
        return [], []

    query_embedding = get_sentence_emd([query])
    scores = cosine_similarity(query_embedding, df['Embeddings'].tolist())[0]

    # assign() builds a new frame, so the caller's DataFrame does not pick up a
    # 'similarity' column as a hidden side effect of running a search.
    ranked = df.assign(similarity=scores).sort_values(by=['similarity'], ascending=False)[:top_n]

    return ranked.Title.tolist(), ranked.similarity.tolist()

## Querying

In [20]:
# query = 'Deep Learning'
# Prompt for a query interactively, then search the copy of the frame that holds the embeddings.
# l1 receives the matching titles and l2 their similarity scores.
query = str(input('Enter a query: '))
l1, l2 = get_similar_sentences(query, df_bkp)

In [21]:
from tqdm.auto import tqdm

# Print every match (title i, similarity j) for the current query.
# zip() has no length, so tqdm is told the total explicitly; without it the
# bar can only render as "0it [00:00, ?it/s]".
for i, j in tqdm(zip(l1, l2), total=len(l1)):
    print(query)
    print("Title : ", i)
    print("Similarity : ", round(j, 2))
    print('---------' * 10)

0it [00:00, ?it/s]

Deep learning
Title :  Speech Emotion Recognizer using Deep Learning
Similarity :  0.52
------------------------------------------------------------------------------------------
Deep learning
Title :  House Property Price Prediction System Using Deep Learning
Similarity :  0.51
------------------------------------------------------------------------------------------
Deep learning
Title :  A pneumonia detection model using deep learning
Similarity :  0.48
------------------------------------------------------------------------------------------
Deep learning
Title :  Dysarthric speech analysis and  recognition system using Deep Neural Networks
Similarity :  0.46
------------------------------------------------------------------------------------------
Deep learning
Title :  A Deep Learning Model for the Segmentation and Classification of Melanoma Skin Lesions
Similarity :  0.45
------------------------------------------------------------------------------------------


----

In [None]:
# Gradio UI

import gradio as gr

def get_similar_sentences(query, df, top_n=5):
    """Return the titles most similar to ``query`` together with their scores.

    This redefines the helper of the same name from the earlier search cell
    and relies on ``cosine_similarity`` having been imported there.

    Args:
        query: Free-text search string.
        df: DataFrame with a ``Title`` column and an ``Embeddings`` column
            holding one embedding (list of floats) per row. It is not
            modified by this function.
        top_n: Number of best matches to return.

    Returns:
        A ``(titles, similarities)`` tuple of two lists, ordered from most to
        least similar. Both lists are empty when ``df`` has no rows.
    """
    # Nothing to rank; also avoids handing an empty matrix to cosine_similarity.
    if len(df) == 0:
        return [], []

    query_embedding = get_sentence_emd([query])
    scores = cosine_similarity(query_embedding, df['Embeddings'].tolist())[0]

    # assign() builds a new frame, so the shared DataFrame passed in by the UI
    # callback is not mutated on every request.
    ranked = df.assign(similarity=scores).sort_values(by=['similarity'], ascending=False)[:top_n]

    return ranked.Title.tolist(), ranked.similarity.tolist()

def get_similar_titles(query):
    """Gradio callback: search the module-level ``df_bkp`` frame for ``query``.

    Args:
        query: Text entered by the user.

    Returns:
        The ``(titles, similarities)`` pair produced by
        ``get_similar_sentences`` with its default ``top_n``.
    """
    matched_titles, match_scores = get_similar_sentences(query, df_bkp)
    return matched_titles, match_scores

# Texts shown in the interface.
title = "Semantic Search"
description = "Search for similar titles"
# Input component: a multi-line textbox for the query.
# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio component
# namespaces (deprecated in 3.x, presumably removed in 4.x) — confirm against
# the installed Gradio version.
article = gr.inputs.Textbox(lines=5, placeholder="Enter a query")
# NOTE(review): only one output component is declared, while get_similar_titles
# returns two lists (titles and similarities) — confirm how they should be shown.
output = gr.outputs.Textbox(type="auto", label="Similar Titles")

# Build the interface around get_similar_titles and launch it.
gr.Interface(get_similar_titles, article, output, title=title, description=description, allow_flagging=False).launch()

