In [41]:
import pandas as pd
import umap
from sentence_transformers import SentenceTransformer, util
import torch
import hdbscan
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
import time
%matplotlib inline

In [24]:
# Load the cleaned reviews, keeping only the 'text' and 'summary' columns.
# (An earlier variant of this cell read 'cleaned_data.csv' with the
# 'cleaned_description' / 'lemmatized_description' columns instead.)
df_text = pd.read_csv(
    'cleaned_reviews2.csv',
    usecols=['text', 'summary'],
)
# Preview the first five rows.
df_text.head()

Unnamed: 0,text,summary
0,saltwater taffy great flavors soft chewy candy...,great just as good as the expensive brands
1,know cactus tequila unique combination ingredi...,the best hot sauce in the world
2,one boys needed lose weight put food floor chu...,my cats love this diet food better than their ...
3,cats happily eating felidae platinum two years...,my cats are not fans of the new food
4,daughter loves twizzlers shipment six pounds r...,lots of twizzlers just what you expect


In [1]:
# TODO: keep only summaries longer than 7 words (placeholder cell; no filtering is applied here)

In [50]:
# Split the review summaries into a 5000-sentence search corpus and 50 query sentences.
summary = df_text['summary'].to_list()

# random_state pins the shuffle so a Restart & Run All reproduces the same split.
corpus, queries = train_test_split(summary, train_size=5000, test_size=50, random_state=42)
# Report the sizes of the two splits created on the line above.
print(f"Train:{len(corpus)} Validate: {len(queries)}")

Train:5000 Validate: 50


In [51]:
def similarity(query_embeddings, docs_embeddings, max_n=10, top_k=5):
    """Rank documents by cosine similarity to a query embedding.

    Args:
        query_embeddings: Embedding(s) of the query. Only row 0 of the
            resulting similarity matrix is used, so if several query
            embeddings are passed, only the first one is ranked.
        docs_embeddings: Embeddings of the candidate documents.
        max_n: Maximum number of best-scoring documents to return. It is
            clamped to the number of available documents.
        top_k: Not used by this function; kept in the signature so existing
            callers that pass it keep working.

    Returns:
        A ``zip`` iterator of ``(score, index)`` pairs taken from the
        ``torch.topk`` result (values first, indices second).
    """
    cos_scores = util.pytorch_cos_sim(query_embeddings, docs_embeddings)[0]
    cos_scores = cos_scores.cpu()
    # torch.topk raises when k is larger than the number of scores, so never
    # ask for more results than there are documents (or fewer than zero).
    k = max(0, min(max_n, cos_scores.shape[0]))
    top_results = torch.topk(cos_scores, k=k)
    return zip(top_results[0], top_results[1])

In [52]:
def get_query_top_k(query, query_embeddings, docs, docs_embeddings, max_n=10, top_k=5, min_p=0.7, exact_match=True):
    """Collect up to ``top_k`` documents similar enough to ``query``.

    The candidates come from ``similarity(...)`` (at most ``max_n`` of them).
    A candidate is kept when its score is strictly greater than ``min_p``;
    when ``exact_match`` is false, candidates scoring above 0.99 are dropped
    as well, since they are treated as exact matches of the query.

    Returns:
        A list of dicts with the keys ``"query"``, ``"sentence"`` and
        ``"score"``, in the order the candidates were produced.
    """
    # Scores above this value count as an "exact match"; 0.99 rather than 1.0
    # leaves room for floating point error.
    exact_match_cutoff = 0.99

    hits = []
    candidates = similarity(query_embeddings, docs_embeddings, max_n=max_n, top_k=top_k)
    for raw_score, doc_idx in candidates:
        value = raw_score.item()
        if len(hits) >= top_k:
            # Already have enough results; remaining candidates are ignored.
            continue
        if not (value > min_p):
            continue
        if not exact_match and value > exact_match_cutoff:
            # Caller asked to skip (near-)exact matches.
            continue
        hits.append({"query": query, "sentence": docs[doc_idx], "score": value})
    return hits

In [35]:
# Instantiate the sentence-embedding model by name. Note that sent_similarity()
# below builds its own SentenceTransformer with the same model name rather than
# reusing this object.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [None]:
# Encode every summary in the dataset into a tensor, showing a progress bar.
# NOTE(review): this cell has no execution count and `embeddings` is not read by
# any of the visible cells below (sent_similarity() encodes its own corpus) —
# confirm whether it is still needed before running it on the full dataset.
embeddings = model.encode(summary, convert_to_tensor=True, show_progress_bar=True)

In [54]:
def sent_similarity(corpus, queries, threshold=0.8, exact_match=True, n=10, top_k=5, embedder=None):
    """Find, for each query sentence, the most similar corpus sentences.

    Args:
        corpus: Iterable of sentences to search in.
        queries: Iterable of sentences to search for.
        threshold: Minimum cosine similarity (exclusive) for a match to be kept.
        exact_match: Passed through to ``get_query_top_k``; when false,
            matches scoring above 0.99 are skipped.
        n: Number of nearest corpus sentences considered per query before
            filtering. Clamped to the corpus size.
        top_k: Maximum number of matches kept per query.
        embedder: Optional, already loaded model exposing
            ``encode(sentences, convert_to_tensor=True)``. When omitted, the
            'distilbert-base-nli-stsb-mean-tokens' SentenceTransformer is
            loaded, as before.

    Returns:
        A DataFrame with the columns ``query``, ``sentence`` and ``score``.
        The columns are present even when there are no matches.

    Side effects:
        Prints the elapsed time in whole seconds.
    """
    start = time.perf_counter()
    columns = ["query", "sentence", "score"]

    # Materialise the inputs so they can be measured, iterated more than once
    # and indexed by position.
    corpus = list(corpus)
    queries = list(queries)

    data = []
    # With nothing to search in or nothing to search for, skip the model work
    # entirely and fall through to an empty, correctly-shaped result.
    if corpus and queries:
        if embedder is None:
            embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
        # Encode all queries in one batch instead of one call per query.
        query_embeddings = embedder.encode(queries, convert_to_tensor=True)

        # Never ask for more neighbours than there are corpus sentences.
        max_n = min(n, len(corpus))

        for query, query_embedding in zip(queries, query_embeddings):
            top_k_list = get_query_top_k(
                query,
                query_embedding,
                corpus,
                corpus_embeddings,
                max_n=max_n,
                top_k=top_k,
                min_p=threshold,
                exact_match=exact_match,
            )
            data.extend(top_k_list)

    # Passing the columns explicitly keeps the schema stable when `data` is empty.
    df = pd.DataFrame(data, columns=columns)
    ex_time = int(time.perf_counter() - start)
    print(f'execution time {ex_time}')
    return df

In [55]:
# For every query sentence, retrieve the corpus summaries scoring above 0.8
# (exact matches allowed), then display the resulting table.
df = sent_similarity(corpus, queries, threshold=0.8, exact_match=True)
df

execution time 7


Unnamed: 0,query,sentence,score
0,as good as in europe,as good as the best have had in europe,0.91245
1,as good as in europe,this is amazing product from europe,0.850032
2,helpful but not quite miracle,pretty good but not amazing,0.851143
3,helpful but not quite miracle,nice but not very tasty,0.834629
4,helpful but not quite miracle,very good but not enough,0.83007
5,helpful but not quite miracle,delicious but not good gift,0.828679
6,helpful but not quite miracle,good but not that good,0.814119
7,great bargain on great candy,damn good candy if say so,0.867343
8,great bargain on great candy,great candy and great service,0.854182
9,great bargain on great candy,one of the best candy ever,0.833833
