In [1]:
import pandas as pd
import umap
from sentence_transformers import SentenceTransformer, util
import torch
import hdbscan
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
import time
%matplotlib inline

In [2]:
# Load the cleaned reviews, keeping only the review body ('text') and its
# short headline ('summary').
review_columns = ['text','summary']
df_text = pd.read_csv('cleaned_reviews2.csv', usecols=review_columns)
df_text.head(n=5)

Unnamed: 0,text,summary
0,saltwater taffy great flavors soft chewy candy...,great just as good as the expensive brands
1,know cactus tequila unique combination ingredi...,the best hot sauce in the world
2,one boys needed lose weight put food floor chu...,my cats love this diet food better than their ...
3,cats happily eating felidae platinum two years...,my cats are not fans of the new food
4,daughter loves twizzlers shipment six pounds r...,lots of twizzlers just what you expect


In [4]:
# NOTE: despite its name, `summary` holds the 'text' column (the review
# bodies), not the 'summary' column. The name is kept because later cells
# index into it.
summary = df_text['text'].to_list()

# Carve out a searchable corpus and a small set of held-out query reviews.
# A fixed random_state makes the split -- and every similarity result derived
# from it -- reproducible across kernel restarts.
corpus, queries = train_test_split(summary, train_size=5000, test_size=50, random_state=42)
print(f"Train:{len(corpus)} Validate: {len(queries)}")

Train:5000 Validate: 50


In [33]:
def similarity(query_embeddings,docs_embeddings,max_n=10, top_k=5):
    """Rank documents by cosine similarity to a query embedding.

    Args:
        query_embeddings: Embedding(s) of the query. Only the first row of the
            resulting similarity matrix is used.
        docs_embeddings: Embeddings of the documents to rank.
        max_n: Maximum number of best-scoring documents to return. It is
            clamped to the number of documents so that a small corpus does
            not make ``torch.topk`` raise.
        top_k: Unused here; accepted so callers can pass it through unchanged.

    Returns:
        A one-shot ``zip`` iterator of ``(score, index)`` tensor pairs in
        descending score order.
    """
    cos_scores = util.pytorch_cos_sim(query_embeddings, docs_embeddings)[0]
    cos_scores = cos_scores.cpu()
    # torch.topk fails when k exceeds the number of scores, so never ask for
    # more results than there are documents.
    k = min(max_n, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=k)
    return zip(top_results[0], top_results[1])

In [34]:
def get_query_top_k(query, query_embeddings, docs, docs_embeddings, max_n=10, top_k=5, min_p=0.7, exact_match=True):
    """Collect up to ``top_k`` documents whose similarity to ``query`` exceeds ``min_p``.

    Candidates come from ``similarity`` (at most ``max_n`` of them). When
    ``exact_match`` is falsy, candidates scoring above 0.99 are treated as
    exact duplicates of the query and skipped; 0.99 rather than 1.0 is used
    to allow for floating-point imprecision.

    Returns:
        A list of dicts with keys ``"query"``, ``"sentence"`` and ``"score"``.
    """
    matches = []
    for raw_score, idx in similarity(query_embeddings, docs_embeddings, max_n=max_n, top_k=top_k):
        score = raw_score.item()
        # Already have enough results: ignore the remaining candidates.
        if not (len(matches) < top_k):
            continue
        # Below (or equal to) the similarity floor.
        if not (score > min_p):
            continue
        # Near-perfect score counts as an exact match; drop it when not wanted.
        if not exact_match and not (score <= 0.99):
            continue
        matches.append({"query":query,"sentence":docs[idx],"score":score})
    return matches

In [14]:
def sent_similarity(corpus, query, threshold=0.8, exact_match=True):
    """Find, for each query text, the most similar sentences in ``corpus``.

    Args:
        corpus: List of document strings to search.
        query: A single query string or an iterable of query strings.
            (Previously this argument was ignored and the notebook-level
            ``queries`` variable was used instead.)
        threshold: Minimum cosine similarity (exclusive) for a match to be kept.
        exact_match: When falsy, matches scoring above 0.99 are skipped.

    Returns:
        A tuple ``(df, embedder, corpus_embeddings)`` where ``df`` has columns
        ``query``, ``sentence`` and ``score``, ``embedder`` is the loaded
        SentenceTransformer model, and ``corpus_embeddings`` are the encoded
        corpus so callers can reuse them without re-encoding.
    """
    start = time.time()
    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    # Accept either one query string or a collection of them; use the
    # argument rather than relying on a global defined in another cell.
    query_texts = [query] if isinstance(query, str) else list(query)

    # Consider the 10 closest corpus sentences per query and keep at most 5
    # of those that pass the threshold.
    n = 10
    top_k = 5

    data = []
    for query_text in query_texts:
        query_embeddings = embedder.encode(query_text, convert_to_tensor=True)
        top_k_list = get_query_top_k(query_text, query_embeddings, corpus, corpus_embeddings, max_n = n, top_k = top_k, min_p=threshold, exact_match=exact_match)
        data.extend(top_k_list)

    # Explicit columns keep the frame's schema stable even when nothing matched.
    df = pd.DataFrame(data, columns=["query", "sentence", "score"])
    end = time.time()
    ex_time = int(end - start)
    print(f'execution time {ex_time}')
    return df, embedder, corpus_embeddings

In [15]:
# Search the corpus for every held-out query; keep the model and the corpus
# embeddings so later cells can reuse them without re-encoding.
df, embedder, corpus_embeddings = sent_similarity(
    corpus,
    queries,
    threshold=0.8,
    exact_match=True,
)

execution time 21


In [16]:
# Disable column-width truncation so the full review text is visible in
# every DataFrame rendered from here on.
pd.options.display.max_colwidth = None
df

Unnamed: 0,query,sentence,score
0,could get fact opened first filled halfway water cans small shrimp companys time drain least empty really packed top tiny cleaned shrimp taste great make really great salad addition little lettuce onions celery mayo course price little since get cans case amazon sale worth buying highly recommend,review refers amy organic soup low sodium minestronei always searching something tasty low sodium foods started low sodium several years back husbands doctor insisted added everyone low sodium quite adjustment first salt definitely th food group adjust thing trying find new different things could find neighborhood market amy low sodium minestrone taste bland used low sodium products doctor maybe little pepper may enjoy much already low sodium happy find something new flavorful buying campbell healthy request minestrone little sodium difference quite noticeable texture amy watery like home made veggies seem like real veggies campbell packed veggies good portion pasta well little bread makes great lunch servings per servings pretty generous size benefits soup beside low sodium really nutritious made al vegan organic ingredients low fat cholesterol heat serve good source vitamins iron benefit well unlike condensed soups water added also marked kosher ingredients filtered water organic tomatoes organic onions organic carrots organic kidney beans organic potatoes organic celery organic pasta organic green beans organic peas organic leeks pure herbs spices organic high oleic safflower sunflower oil sea salt organic garlic contain wheat facility also processes milk soy tree nuts seed products however use peanuts fish shellfish eggs sodium per serving mg dietary fiber gm sugar protein total fat way order shipped arrived promptly remember breathe eat healthy enjoy laurie,0.802478
1,best pancake mix celiac children like food usually begg taste things spit made chocolate chip cookies mix came home school even told said could tell made pancakes ate gone far best tasting mix,tried recommended store employee favorite gf pancake mix bought ever since gluten free new family daughter diagnosed allergy gluten since hunt works us sadly lot products pancake waffle mix however enthusiastic hit large family even skeptical non gf members family love make crepes truly come perfect every time mix also bolstered lot additional nutrition find gf mixes perfect sweetness batter considering even need syrup additional toppings,0.81938
2,high quality pure marzipan grade product need candy coating sugar top flavor rich best way eat small bites end eating four loaf one sitting big hurry love marzipan owe least give one try odds ordering regular basis sometimes amazon prime get get much though reading nutritional information might give heart palpitations,taste newton fruit thins pretty good nice crunchy treat really enjoyed cranberry citrus oat flavor like really thin crisp designer oatmeal cookie would liked product cookies smaller size snacks rather large round shape thin easily break crumble current size could easily consume one two bites eliminate crumbs packaging leaves lot desired outer foil bag bad inside plastic tray clumsy tough slide back resealable wrap combined large crumbly nature cookie already mentioned doubt much anyone buying store ends half cookies actually whole time get home ordering amazon easily guarantees half product inside pieces like said smaller size makes em easier eat,0.804243
3,high quality pure marzipan grade product need candy coating sugar top flavor rich best way eat small bites end eating four loaf one sitting big hurry love marzipan owe least give one try odds ordering regular basis sometimes amazon prime get get much though reading nutritional information might give heart palpitations,huge fan chocolate covered pretzels ms favorite candy saw product pretty much melted puddle giddy anticipation pretzel good crunchy sweet light low fat large ms grams fat problem largely made pretzel tiny barely coating chocolate surrounding finished thick coating candy shell basically like candy coated pretzel much chocolate less average chocolate covered pretzel also pretzel salted much really like eating candy coated unsalted pretzel never fan jordan almonds remind lot jordan almonds pretzel part smaller saltier chocolate would best candy ever invented right fine three four time enough fix,0.801557
4,high quality pure marzipan grade product need candy coating sugar top flavor rich best way eat small bites end eating four loaf one sitting big hurry love marzipan owe least give one try odds ordering regular basis sometimes amazon prime get get much though reading nutritional information might give heart palpitations,yeah yeah know high sugar compared healthy cardboard tasting cereals sweet delicious filled fiber fan plain old raisin bran find crunch little clusters really add another element also tried raisin bran extra yellow box cranberries jazz little much excitement morning really cannot go wrong price amazon enough fiber give plenty time catch reading bathroom morning break work,0.801473
5,drinking decaf green tea daily decade several brands started drinking republic tea people green tea decaf little two years ago really love bitter green tea yet strong enough taste like something something good give try bet like,best green tea market opinion sweet right amount honey hands favorite pre made green tea brewed,0.803288
6,used favors daughter circus birthday party agree reviews cookies seem smaller remember kid last time look nutritional value happy get small cookies versus large cookies arrived good condition reasonable amount broken expected even store seals back bags taped help prevent coming open tasted great everyone kids adults happy see treat order since cannot find big bags store shelves,cookies good chewy great flavor next time may try baking slightly shorter time recommended mine little brown around edges otherwise perfect son thought fantastic disappointed day made finished could stop missing good fresh cookies years noticed negative reviews people tried products shockingly even one someone plugging gluten free bakery hypocritically criticizing others getting gluten free bandwagon since bucks made well turns perhaps larger manufacturer research money testing facilities get taste right mix better nearly specialty mixes tried sad many pans cakes brownies blondies cookies tossed garbage inedible thank betty crocker making high quality gluten free mix available regular grocery store,0.803766
7,ever freeze dried ice cream stuff size treat large definitely enough enjoy bar individually wrapped also lots fun give anyone know freeze dried ice cream pretty straight forward water removed normal ice cream special way first ice cream kept frozen vacuum applied vacuum causes water slowly sublimate vaporize vaporized water eventually sucked left chunk ice cream everything water note melt process brings water straight solid gas technique used lots foods makes food last forever without water bacteria molds simply cannot grow hippie feel safe process use crazy chemical science kill mojo,new click love learned mix hours earlier let sit either counter refrigerator get smoother drink using shaker wire ball second tip carbs counting good news half recipe actually good size one scoopful one cup water poured ice perfect afternoon pick necessary make scoop version unless hungry havent used blender yet still pouring ice one day make frappe pleased see dry powder fully blends water let set spell course also get twice many drinks train smaller one,0.806771
8,twin girls born weeks around pounds three weeks breastfeeding exclusively dropped pounds pediatrician recommended supplement breastfeeding neosure results astounding within month girls pounds calorie laden stuff ever since reason give full five stars gave baby girls touch reflux combat ml mylanta feeding also little difficult mix formulas since creamier fattier need really shake mix sure little beads formula settle bottom bottle yes tad pricier formulas however like reviewers found little less cvs even snapped sale wegman though made girls big strong even months yet already sitting rolling trying stand laps talking storm,used lansinoh bags first son three week ago five month old one day husband lost three bags milk due leaks sitter issues ever since lansinoh repackaged bags nothing problems working breastfeeding mom understands defeat bag milk go waste shop amazon pretty much everything saw new product figured would give try bags arrived super fast first glance bags seem reinforcement quite smitten zipper well much higher quality seal lansinoh also seemed fewer air bubbles pouring milk bag negatives bag readily available case run would like space volume know moms like name child breastfed baby daycare however easy enough write oz space instead overall seems like great alternative competitors high hopes bag,0.824615
9,truly enjoy taste aroma delicious cup selection indeed remind special mornings childhood mother would make french toast serve family warm maple syrup side coffee medium bodied bitter taste many flavored coffees seem leave much like fact fair trade certified means coffee grower paid living wage crop heartily recommend cup flavor course taste highly subjective especially comes coffees doctor unique way might suggest read many reviews possible considering unfamiliar variety base purchasing decision general consensus rather one review enjoy,okay ever buy instant oatmeal like buy bulk organic quick oats add brown sugar always found instant oatmeal little packs good flavor despite additives always want add sugar make taste better part oatmeal lived expectations admit compared maple flavored brands tried one better still would buy one daughters said like eat glad got one box use genuine maple syrup household even keep maple sugar sprinkling toast big maple fans guess genuine maple fans artificial flavor cut even felt like quite sweet enough even though sugar cinnamon spice flavor oatmeal better flavor brands maple flavored oatmeal tried gave stars saying ok right,0.802496


In [17]:
# Get the top 5 most similar corpus sentences for a single query picked by index from the full review list

In [38]:
def get_random_sim(query, embedder, corpus_embeddings, docs=None, max_n=10, top_k=5, min_p=0.65, exact_match=True):
    """Return the best corpus matches for a single query string as a DataFrame.

    Args:
        query: The query text to embed and search for.
        embedder: Model exposing ``encode(text, convert_to_tensor=True)``.
        corpus_embeddings: Pre-computed embeddings of the documents.
        docs: Documents that ``corpus_embeddings`` were computed from. Defaults
            to the notebook-level ``corpus`` for backward compatibility.
        max_n: Number of nearest candidates to consider.
        top_k: Maximum number of matches to return.
        min_p: Minimum cosine similarity (exclusive) for a match to be kept.
        exact_match: When falsy, matches scoring above 0.99 are skipped.

    Returns:
        A DataFrame with columns ``query``, ``sentence`` and ``score``.
    """
    if docs is None:
        # Fall back to the corpus built earlier in the notebook.
        docs = corpus
    query_embeddings = embedder.encode(query, convert_to_tensor=True)
    top_k_list = get_query_top_k(query, query_embeddings, docs, corpus_embeddings,
                                 max_n=max_n, top_k=top_k, min_p=min_p, exact_match=exact_match)
    # Explicit columns keep the schema stable even when nothing matched.
    return pd.DataFrame(top_k_list, columns=["query", "sentence", "score"])

In [40]:
# Closest corpus matches for the review stored at position 1201 of `summary`.
get_random_sim(
    query=summary[1201],
    embedder=embedder,
    corpus_embeddings=corpus_embeddings,
)

Unnamed: 0,query,sentence,score
0,like love black licorice seriously recommend product eat love black licorice recently found observed reported nuitionists well evidence black licorice aid constipation comes flavors medicinal effects come black,write review raspberry licorice completing review original black variety starters craving something fruity chewy love sweet slightly tangy flavor raspberries licorice well soft texture easily chew satisfying fat free guilt keep reach high shelf consume,0.779617
1,like love black licorice seriously recommend product eat love black licorice recently found observed reported nuitionists well evidence black licorice aid constipation comes flavors medicinal effects come black,found good substitutions gluten filled foods exception licorice love although expensive,0.763085
2,like love black licorice seriously recommend product eat love black licorice recently found observed reported nuitionists well evidence black licorice aid constipation comes flavors medicinal effects come black,find drink anything special terms taste black cherry juice infused carbonation gave odd taste black cherries favorite fruit however tried drink make positive impression juice quite acidic feel fruit juice drink comes cans rather glass bottle would helped drink maintain juicy taste longer preservatives gluten free sugar artificial flavors drink special features stand well market crowded many drinks minute maid mixed berry juice fl oz bottles fl oz cal valls certified organic juices clementine igourmet com tang sugar free orange drink mix ounce units crystal light classic orange ounce unit nesbitts california variety pack,0.74429
3,like love black licorice seriously recommend product eat love black licorice recently found observed reported nuitionists well evidence black licorice aid constipation comes flavors medicinal effects come black,wonderful tea really supresses appetite love packaging also would highly recommend weight loss tea tried many claim one garcinia actually works supress appetite great tasting also tasted many could stand,0.741829
4,like love black licorice seriously recommend product eat love black licorice recently found observed reported nuitionists well evidence black licorice aid constipation comes flavors medicinal effects come black,surprised find black tea licorice usually licorice reserved herbal teas mix ginseng peppermint licorice unusual taste treat one would recommend others mint strong licorice sweet normally like ginseng teas ginseng stays back brew noticeable yet sure still get health benefits,0.734146


In [43]:
# Closest corpus matches for the review stored at position 18774 of `summary`.
get_random_sim(
    query=summary[18774],
    embedder=embedder,
    corpus_embeddings=corpus_embeddings,
)

Unnamed: 0,query,sentence,score
0,opened first box gold kili ginger brewing bags took sniff wow strong scent ginger told treat unsweetened brewing bags make fabulous ginger tea whatever else want add kick ginger like best ginger lingering afterburn may repeat later burp excellent product ginger lover,always hunt ginger tea spicy punch ginger teas overwhelmed flavors ingredients one however pure ginger little bit honey heaven like double extra kick delicious drink cups day highly recommend product like ginger time brew ginger tea scratch,0.897244
1,opened first box gold kili ginger brewing bags took sniff wow strong scent ginger told treat unsweetened brewing bags make fabulous ginger tea whatever else want add kick ginger like best ginger lingering afterburn may repeat later burp excellent product ginger lover,tasty spicy beverage may cup tea speak smell ginger strong opening pouch taste rather strong piquant many people used strong ginger flavor may putting even unpleasant idea ginger stops canada dry ginger ale probably like product grew ginger foods love closest thing compare indonesian ting ting jehe chewy ginger candy basically liquid version candy sweet spicy tickles throat going juiced fresh ginger root tastes like ginger infused juice made pungency diminished manufacturing served hot perfect sick often make ginger tea honey weather convenient way tasty hot beverage ginger honey used thousands years throughout world health medicinal benefits lemon flavor could detect however,0.873972
2,opened first box gold kili ginger brewing bags took sniff wow strong scent ginger told treat unsweetened brewing bags make fabulous ginger tea whatever else want add kick ginger like best ginger lingering afterburn may repeat later burp excellent product ginger lover,new runa taste consistent great balance ginger citrus kick also side effects somewhat get teas made great summer tea well,0.824901
3,opened first box gold kili ginger brewing bags took sniff wow strong scent ginger told treat unsweetened brewing bags make fabulous ginger tea whatever else want add kick ginger like best ginger lingering afterburn may repeat later burp excellent product ginger lover,really nice aroma opened packet mixture lemon ginger taste lemon particularly hold mouth moment ginger taste really prevalent flavor kick get eat fresh pickled ginger enjoy drink weird unpleasant aftertaste would like something like actual product probably would make grocery cart,0.803757
4,opened first box gold kili ginger brewing bags took sniff wow strong scent ginger told treat unsweetened brewing bags make fabulous ginger tea whatever else want add kick ginger like best ginger lingering afterburn may repeat later burp excellent product ginger lover,galanga galangal root spice similar ginger flavor appearance flavor different complexity intensity ginger root much robust woody ginger whereas easy slice ginger eat root something would eat outright recommendation order warm hot season without expedited shipping stars product arrived small amount decay mold root two large rootstocks package sealed plastic bag would benefited ventilation sealed condensation formed wetness constant contact root combined warmth spring caused small amount decay mold occur small amount removed without much difficulty one root cooked growing garden recommend using something akin bone saw cut rootstock quite woody,0.785466
