In [1]:
import os

# Location of the titles CSV. Set the TITLES_CSV environment variable to point
# at a different copy of the file; when it is unset, the original hard-coded
# location is used unchanged.
path = os.environ.get(
    'TITLES_CSV',
    '/Users/olang/Desktop/*/Projects/NLP/Semantic Search/sampledata/titles.csv',
)

In [2]:
import torch
import torch.nn.functional as F

In [3]:
import pandas as pd

# Read the titles CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Title
0,Security Guard Deployment System Using Vehicul...
1,A Breast Cancer Detection System using Machine...
2,Autism Spectrum Disorder prediction in childre...
3,Energy consumption prediction and scheduling ...
4,An Intelligent Chatbot for Finance and Banking


In [22]:
# Count missing values per column.
# NOTE: the saved output below lists an 'Embeddings' column, which is only
# added by a later cell, so this cell was executed out of order. On a fresh
# top-to-bottom run only the columns loaded from the CSV would be reported.
df.isna().sum()

Title         0
Embeddings    0
dtype: int64

In [4]:
# sample sentences

# Use the first two titles as the sample sentence pair and show them.
x, y = df["Title"].iloc[0], df["Title"].iloc[1]
print(x, y)

Security Guard Deployment System Using Vehicular Movement Behaviour: Case of Residential Real Estates
 A Breast Cancer Detection System using Machine learning 


In [7]:
# create an embedding for each sentence using a transformer model
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load the tokenizer and the encoder from the same HuggingFace Hub checkpoint.
# The checkpoint id is named once so the two can never point at different models.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [8]:
# Batch the two sample titles and tokenize them together; padding and
# truncation are enabled and the result is requested as PyTorch tensors.
sentences = [x, y]
sample_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [9]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed to be shaped batch x tokens x features).
        attention_mask: Tensor with one entry per token; it is broadcast across
            the feature dimension and used as a 0/1 weight for each token.

    Returns:
        Tensor with one pooled vector per batch row. The divisor is clamped to
        at least 1e-9, so a row whose mask is all zeros yields zeros rather
        than a division by zero.
    """
    hidden_states = model_output[0]

    # Give the mask a trailing feature axis and stretch it to the embeddings' shape.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_total = (hidden_states * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts

In [10]:
# Run the encoder to obtain per-token embeddings. torch.no_grad() switches off
# gradient tracking because this is inference only (no backpropagation/training).
with torch.no_grad():
    sample_embeddings = model(**sample_input)  # ** unpacks the tokenizer output into keyword arguments

# Mean-pool the token embeddings into one vector per sentence, then scale each
# vector to unit L2 (Euclidean) norm. Per the original note, this is intended to
# match the sentence embeddings produced by the sentence-transformers library.
sample_sent_embeddings = F.normalize(
    mean_pooling(sample_embeddings, sample_input['attention_mask']),
    p=2,
    dim=1,
)

In [13]:
# Report the shape of the pooled sample sentence embeddings.
print("Sentence embeddings:", sample_sent_embeddings.shape, sep="\n")

Sentence embeddings:
torch.Size([2, 384])


In [15]:
# enclose all of the above in a function

def get_sentence_emd(sentence_list, batch_size=None):
    """Return L2-normalised, mean-pooled sentence embeddings.

    The sentences are tokenized with the module-level ``tokenizer`` (padding
    and truncation enabled), passed through the module-level ``model`` with
    gradient tracking disabled, mean-pooled with ``mean_pooling`` and finally
    scaled to unit L2 norm.

    Args:
        sentence_list: Sentences to embed (a list of strings; a single string
            is passed to the tokenizer as-is).
        batch_size: Optional maximum number of sentences per forward pass.
            ``None`` (the default) encodes everything in one pass, exactly as
            before. A positive integer bounds peak memory on larger corpora by
            encoding the list in consecutive chunks.

    Returns:
        A 2-D tensor with one unit-length row per input sentence, in input order.

    Raises:
        ValueError: If ``batch_size`` is given but is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    def _embed(batch):
        # Tokenize -> encode -> mean-pool -> normalise for one batch.
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])
        return F.normalize(pooled, p=2, dim=1)

    # Single pass: no batching requested, a lone string (which must not be
    # sliced into characters), or input that already fits in one batch.
    if batch_size is None or isinstance(sentence_list, str) or len(sentence_list) <= batch_size:
        return _embed(sentence_list)

    # Padding is applied per chunk; the masked mean pooling ignores padded
    # positions, so chunking should only change results by floating-point noise.
    chunks = [
        _embed(sentence_list[start:start + batch_size])
        for start in range(0, len(sentence_list), batch_size)
    ]
    return torch.cat(chunks, dim=0)

In [16]:
# Embed every title in the DataFrame.
all_embeddings = get_sentence_emd(df["Title"].tolist())

In [17]:
# Show the dimensions of the embedding matrix (rows x features).
print(all_embeddings.size())

torch.Size([140, 384])


In [21]:
# Store each title's embedding as a plain Python list in a new column, then
# preview the first five rows.
df['Embeddings'] = [row.tolist() for row in all_embeddings]
df.head(n=5)

Unnamed: 0,Title,Embeddings
0,Security Guard Deployment System Using Vehicul...,"[0.0831846296787262, -0.00844046100974083, -0...."
1,A Breast Cancer Detection System using Machine...,"[-0.005373380612581968, 0.006123208440840244, ..."
2,Autism Spectrum Disorder prediction in childre...,"[0.04774220287799835, -0.08992604911327362, 0...."
3,Energy consumption prediction and scheduling ...,"[-0.04396095499396324, 0.02846721187233925, 0...."
4,An Intelligent Chatbot for Finance and Banking,"[-0.04426628351211548, 0.0009629157721064985, ..."


In [24]:
# Persist the titles and their embeddings, omitting the row index.
# NOTE(review): each embedding cell holds a Python list, so it is presumably
# written as that list's string form and must be parsed back into a list when
# the CSV is reloaded - confirm against the consumer of this file.
df.to_csv(path_or_buf='titles_embeddings.csv', index=False)

----

----