In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [2]:
# Read the headlines corpus (one JSON record per line) into a DataFrame and preview it
corpus_df = pd.read_json(path_or_buf="News_Category_Dataset_v2.json", lines=True)
corpus_df.head(n=5)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
# Collect every headline into a plain Python list and show how many there are
sentences = corpus_df.loc[:, "headline"].to_list()
len(sentences)

200853

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask tensor with one entry per token; it is broadcast
            over the embedding dimension and used as a 0/1 weight.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the mask total for each sequence.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the embedding dimension so it lines up with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_counts

def huggingfaces_examples(sentences, model_name='sentence-transformers/all-MiniLM-L12-v1', batch_size=64):
    """Embed sentences with a Hugging Face model using masked mean pooling.

    The input is processed in mini-batches so that a large corpus is not
    tokenised and pushed through the model as one giant padded batch.

    Args:
        sentences: A single string or an iterable of strings to embed.
        model_name: Hugging Face Hub identifier of the checkpoint to load.
            Defaults to the checkpoint this notebook originally hard-coded.
        batch_size: Number of sentences tokenised and encoded per forward
            pass. Must be a positive integer.

    Returns:
        A 2-D tensor with one L2-normalised embedding per input sentence, in
        input order. An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is one sentence; wrapping it keeps the batching code uniform.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if not sentences:
        # Nothing to encode: return an empty result with the model's embedding width.
        return torch.empty((0, model.config.hidden_size))

    embedded_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Tokenize sentences (padding only up to the longest sentence in this batch)
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings without tracking gradients
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling, then L2-normalize each sentence embedding
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])
        embedded_batches.append(F.normalize(pooled, p=2, dim=1))

    return torch.cat(embedded_batches, dim=0)



In [6]:
# Sentence encoder used for every embedding below. This is a stand-in: it will be
# substituted with our best model, hopefully the one with the combination objective.
model = SentenceTransformer('all-MiniLM-L6-v2')
print(">>>>>>>>>>>>>>>>>>>>>>>>>>> print embeddings for sentences <<<<<<<<<<<<<<<<<< ")

# Sentences to encode: the first 40000 headlines of the corpus
sentences = corpus_df["headline"].tolist()[:40000]
# Small hand-written sample for quick experiments:
# sentences = ['A man is eating food.',
#           'A man is eating a piece of bread.',
#           'The girl is carrying a baby.',
#           'A man is riding a horse.',
#           'A woman is playing violin.',
#           'Two men pushed carts through the woods.',
#           'A man is riding a white horse on an enclosed ground.',
#           'A monkey is playing drums.',
#           'Someone in a gorilla costume is playing a set of drums.'
#           ]

# Alternative encoder based on the Hugging Face example:
# sentence_embeddings = huggingfaces_examples(sentences)
sentence_embeddings = model.encode(sentences)  # shape: (#sentences, embedding dimension)

# Report the overall shape, the shape of one embedding, and the container type
print(
    sentence_embeddings.shape,
    sentence_embeddings[1].shape,
    type(sentence_embeddings),
    sep="\n",
)

# Pair each headline with its embedding vector, one row per headline
data = [[headline, vector] for headline, vector in zip(sentences, sentence_embeddings)]

df = pd.DataFrame(data)
df.head()


>>>>>>>>>>>>>>>>>>>>>>>>>>> print embeddings for sentences <<<<<<<<<<<<<<<<<< 
(40000, 384)
(384,)
<class 'numpy.ndarray'>


Unnamed: 0,0,1
0,There Were 2 Mass Shootings In Texas Last Week...,"[0.08234781, -0.10027518, 0.035071157, 0.03161..."
1,Will Smith Joins Diplo And Nicky Jam For The 2...,"[-0.06564419, 0.021112874, -0.013969741, -0.00..."
2,Hugh Grant Marries For The First Time At Age 57,"[0.028017538, 0.011230946, 0.04382062, 0.01243..."
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,"[-0.027169736, -0.0041060047, -0.018725948, -0..."
4,Julianna Margulies Uses Donald Trump Poop Bags...,"[0.018603941, -0.0076316213, 0.051381584, 0.02..."


In [7]:
#rename the columns
df = df.rename(columns={0: 'sentence', 1: 'embedding'})
df.head()

Unnamed: 0,sentence,embedding
0,There Were 2 Mass Shootings In Texas Last Week...,"[0.08234781, -0.10027518, 0.035071157, 0.03161..."
1,Will Smith Joins Diplo And Nicky Jam For The 2...,"[-0.06564419, 0.021112874, -0.013969741, -0.00..."
2,Hugh Grant Marries For The First Time At Age 57,"[0.028017538, 0.011230946, 0.04382062, 0.01243..."
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,"[-0.027169736, -0.0041060047, -0.018725948, -0..."
4,Julianna Margulies Uses Donald Trump Poop Bags...,"[0.018603941, -0.0076316213, 0.051381584, 0.02..."


In [10]:
# Read the query and how many matches to show
searching_word = input("Please enter the word you are searching for: ")
k= int(input("Enter the number of similar sentences you are searching for (k): "))
# Keep k within [0, number of sentences]: a negative k would otherwise make the
# slice below drop rows from the end instead of selecting the top matches.
k = max(0, min(k, len(df)))

word_embedding=model.encode(searching_word)
#word_embedding=huggingfaces_examples(searching_word)

# Score the query against every sentence embedding in a single vectorised call
# instead of one cos_sim call per row; row 0 of the result holds the query's scores.
similarities = util.cos_sim(word_embedding, sentence_embeddings)[0].tolist()

# assign() returns a new frame, so the shared `df` is not modified by the search
df_temp = df.assign(similarity_score=similarities).sort_values(by=['similarity_score'], ascending=False)

print("The word you searched for is {}".format(searching_word))
print("The {} sentences similar to your input are".format(k))
print(df_temp.iloc[0:k]['sentence'].to_string(index=False))
    


The word you searched for is dog
The 3 sentences similar to your input are
           After All, It's Only A Dog
   This Dog Really Wants To Be A Baby
Dear Santa: Another Plea From The Dog
