# Imports

In [None]:
!pip list | grep scipy 

In [100]:
# torch                              2.2.0
# transformers                       4.46.3
# Levenshtein                        0.25.1
# pandas                             1.4.4
# numpy                              1.21.2
# scipy                              1.8.0
import torch 
import transformers
import Levenshtein
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.spatial.distance import cdist

In [101]:
# seed everything
seed = 21
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7febf2c35270>

# Data

In [102]:
df_ = pd.read_csv("walmart.csv")

In [103]:
df = df_[['Uniq Id', 'Description']]
df = df.loc[df['Description'].notna()]
df = df.loc[df['Description'].notnull()]
# all descriptions had same prefix (up to 162 symbol)
df["Description"] = df["Description"].apply(lambda x: x[162:])
df_small =  df[:4000]

# Basline model

In [104]:
# simply search closest products by Levenshtein distance
def get_n_rank(query, products, N = 10):
    products["Dist"] = products["Description"].apply(lambda x: Levenshtein.distance(query, x))
    return products.sort_values(by = "Dist")[:N]['Uniq Id']
    

In [105]:
query = "cookie with chocolate"
result = get_n_rank(query, df_small, N = 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products["Dist"] = products["Description"].apply(lambda x: Levenshtein.distance(query, x))


In [106]:
# awfull data...
df_small[df_small["Uniq Id"].isin(result) ].sort_values(by = "Dist")

Unnamed: 0,Uniq Id,Description,Dist
1400,d842ffaadb2592f28c89269b2e43da8a,KM Mask Wraps,18
1968,03c0a3b2eb1f0a0abc6a54ca6341ad42,Eye black. Eye Black,18
3464,16b13b5d4249b5028be1b5b959f4f3bd,"Insect Killer, 80 Watt",18
177,87fb745148514a09965b4569fee9b2b0,new,19
1841,6190f498f0e431cf62f783a12d131f5e,new,19
1942,f7d06ec763a9348bb7f8bad91f961e98,tit,19
3027,b09b7c670681ddbf1eea9ce5a25b6c5c,new,19
3089,6108ad3cd818aff45f8025a52b467b08,"Official is 6""x24"" 0",19
3182,682a3ded1ab1559575c7ffa7e7288faf,Promax Brake Rear Red.,19
3826,1f518b6fa64b9a77b57cac76a029653a,Psyllium Whole Husks,19


# ML model

In [107]:
# https://huggingface.co/docs/transformers/model_doc/distilbert
# we desided to use that particular model due to its small sizes

from transformers import AutoTokenizer, DistilBertModel
import torch

class Model:
    def __init__(self, product_DB: pd.DataFrame, to_set_up: bool = False, path_to_vector_BD: str = None):

        # gona instantiate once with special flag
        # or method -  to pervent accidental start
        # of such a heavy process
        self.model = None
        self.tokenizer = None

        # np.Array of pre-computed vectores
        # representing products
        self.vector_DB = None

        # Download from .npy file
        if path_to_vector_BD:
            print("Downloading Vector BD")
            self.vector_DB = np.load(path_to_vector_BD)

        # pandas dataframe with columns "Description" and "Uniq Id"
        self.products_DB = product_DB.sort_values(by = "Uniq Id")
            
        # start model and tokenizer downloading and 
        # instantiating 
        if to_set_up:
            self.set_up()
        
    def set_up(self, max_position_embeddings = 512):
        #  model and tokenizer downloading and instantiating 
        self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.model = DistilBertModel.from_pretrained("distilbert-base-uncased", max_position_embeddings = max_position_embeddings)

    def calc_embeding(self, X: str):
        # calulate embeding for one user quary
        if not self.model:
            print("U should set_up() nodel first!")
            return 
        inputs = self.tokenizer(X[:512], return_tensors="pt")
        return self.model(**inputs).last_hidden_state[:,-1,:].detach().numpy()
        
    
    def form_vector_DB(self):
        # forms vector embedings of products
        # self.products_DB (if not provided)
        
        if self.vector_DB is not None:
            print("Vector DB already exists!")
            return
            
        self.vector_DB = np.stack(
            tuple( self.calc_embeding(row["Description"]) for _, row in tqdm(self.products_DB.sort_values(by = "Uniq Id").iterrows(), total = len(self.products_DB))
            )
        )
        
    def get_n_rank(self, query, n = 10):
        # get top n closest simanticly products ID's
        
        if self.vector_DB is None:
            print("No Vector DB exists!")
            return 
            
        if not self.model:
            print("U should set_up() nodel first!")
            return 
            
        query_v = self.calc_embeding(query)
        top_indx = cdist(query_v, self.vector_DB, 'cosine')[0].argsort()[:n] # top most simular (with smallar distance)
        out_uniq_id = self.products_DB.sort_values(by = "Uniq Id").iloc[top_indx]["Uniq Id"]
        return out_uniq_id

In [108]:
# init vodel
m = Model(df_small, path_to_vector_BD = "vector_BD.npy", to_set_up = True)

Downloading Vector BD


In [109]:
# quary test sample
query = "cookie with chocolate"
query_v = m.calc_embeding(query)

In [110]:
# top 10 most related products
df_small[df_small["Uniq Id"].isin(m.get_n_rank( query, 10))]

Unnamed: 0,Uniq Id,Description,Dist
458,01051b12407b3eaddcaa7fd46c5e69b1,Warm Cinnamon Roll Dip Mix,24
850,77a82b87b862a17578ce534c82a53104,CHOCOLATE CARNIVOR SHRED CHOCOLATE 28/S,37
1259,b3d3a3747f124e7e2514be3fcc7004c3,Suzo Happ Pool Table Cloth Felt Cleaner,29
1265,5e995e826d5039642e757b35343056c2,Toy Story 4 Color Twist Bath Bomb Sheriff Cott...,132
2055,0d24fcaf5be6782e707c325395dfe4e2,Food club peanut butter crunchy Crunchy Peanut...,41
2063,1b7094717e52c708870a9f15c4f0bf93,Greenmax - Rice & Peanut Milk with Brown Sugar...,47
2345,c045e4d36625067483ecebeae97dc671,Chifles Sweet Plantain Chips 9 oz,27
2370,66a18e6925b9ad8aa73f02526590541b,"Food Club, Petite Diced Tomatoes",21
3937,f5c9eaf3f8df57c87388b14e7e5a0215,"Plains Dairy Premium Chocolate Milk, 1 Pint",31
3943,ea85d77c98c88478e28cf845085dfa44,"Sweets Candy Milk Chocolate Sticks, Raspberry,...",44
