## Data prep

In [1]:
# load data
import json
# Read the raw catalogue as text. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's locale default. Non-breaking
# spaces are replaced by plain spaces before the text is parsed.
with open('data.json', 'r', encoding='utf-8') as file:
    raw_text = file.read().replace('\xa0', ' ')

# Parsed JSON document used by the following cells.
data = json.loads(raw_text)

In [2]:
# basic json transformations to df
import pandas as pd

# Flatten the JSON into dotted paths (one per row after transposing), keep
# only the paths that mention 'children', and name the two resulting columns.
flattened = pd.json_normalize(data).T
children_rows = flattened.index.str.contains('children')

df = flattened.loc[children_rows].reset_index()
df.columns = ['big_category', 'nice']

def rep(x):
    """Return the part of the string ``x`` that precedes its first '.'.

    The whole string is returned when it contains no '.'.
    """
    head, _, _ = x.partition('.')
    return head
# Keep only the leading path segment as the top-level category label.
df['big_category'] = df['big_category'].apply(rep)

In [3]:
# expand the jsons - one layer further
# Expand each category's payload into columns and index the result by the
# category label (the label column itself is consumed by set_index).
df1 = pd.concat(
    [df['big_category'], pd.json_normalize(df['nice'])],
    axis=1,
).set_index('big_category')

## Extract info from JSON

In [4]:
# our uncleaned corpus to list of jsons
import numpy as np
# Flatten every cell of df1 into one 1-D array and drop the None placeholders.
all_cells = df1.values.reshape(-1)
is_filled = all_cells != np.array(None)
listy = all_cells[is_filled]

In [5]:
# expand the list of jsons 2 layers deeper
# Maps a child title to the list of row values that carry this title.
pre_final_corpus = {}

for node in listy:
    # one row per entry of the node's 'children'; the node's own title is
    # kept in the 'll_title' meta column
    children = pd.json_normalize(node, 'children', meta='title', meta_prefix='ll_')

    for child_title in children.title:
        same_title = children[children.title == child_title]
        # dict of jsons, keyed by the title found in the selected rows
        pre_final_corpus[same_title.title.values[0]] = same_title.values.tolist()

In [6]:
# Catalogue of per-(sub)category parameters; it is used later to enrich the
# query results with manufacturer information.
product_frames = []

# extract the parameters of each (sub)category
for category in pre_final_corpus.keys():

    try:
        record = pre_final_corpus[category][0]

        # The hierarchy comes in two depths. Records whose text mentions 'tag'
        # keep their parameters under 'tags' at position 3 of the record; the
        # others keep them directly at position 2.
        if 'tag' in str(record):
            bff = pd.json_normalize(record[3], 'tags', meta='title', meta_prefix='sub_')
            extra_columns = {'ob_class': category}
        else:
            bff = pd.json_normalize(record[2])
            extra_columns = {'sub_title': category, 'ob_class': category}

        # Remove parameters without values, then attach the category labels.
        # `assign` builds a new frame, so nothing is set on a filtered slice.
        bff = bff[bff.children.apply(len) > 0].assign(**extra_columns)

    except Exception:
        # Best effort: a category whose record cannot be expanded still gets
        # a placeholder row. KeyboardInterrupt/SystemExit are not swallowed.
        bff = pd.DataFrame({'title': [''], 'children': [''],
                            'sub_title': [category], 'ob_class': [category]})

    product_frames.append(bff)

# drop the intermediate frame
bff = None

# final dataframe with all the parameter lists
product_info = pd.concat(product_frames)
product_info.loc[product_info.title.apply(str).str.contains('Manufacturer'), 'title'] = 'Manufacturer'

# concatenate the columns (as str) to construct the bag of words
product_info['corpus'] = (
    product_info.title + ' ' + product_info.children.apply(str)
    + ' ' + product_info.sub_title + ' ' + product_info.ob_class
)

product_info.head()

Unnamed: 0,title,children,sub_title,ob_class,corpus
0,,,Ride-on sweeper,Ride-on sweeper,Ride-on sweeper Ride-on sweeper
0,Manufacturer,"[{'title': 'Draper (8)'}, {'title': 'Kaiser+Kr...",Street broom,Street broom,"Manufacturer [{'title': 'Draper (8)'}, {'title..."
1,Broom width,"[{'title': '30 cm (8)'}, {'title': '33 cm (4)'...",Street broom,Street broom,"Broom width [{'title': '30 cm (8)'}, {'title':..."
2,Bristle material,"[{'title': 'Coco (7)'}, {'title': 'Piassava (1...",Street broom,Street broom,"Bristle material [{'title': 'Coco (7)'}, {'tit..."
3,Handle system,"[{'title': 'Thread (2)'}, {'title': 'n/a (48)'}]",Street broom,Street broom,"Handle system [{'title': 'Thread (2)'}, {'titl..."


## NLP: data pre-processing

In [7]:
# import libraries and prepare the transformers used for data preprocessing
import string
from string import digits
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import edit_distance
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
# import re
# import nltk
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')

# Shared lemmatizer instance used by the text-cleaning function.
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words plus catalogue-specific noise tokens
# (units, generic attribute names, filler words).
stop_words = stopwords.words('english') + [
    'manufacturer', 'cm', 'l', 'm', 'kg', 'g', 'w', 'r', 'ø',
    'brand', 'new', 'something', 'tag', 'description', 'material', 'width', 'weight',
    'volume', 'na', 'content', 'no', 'mm', 'piece', 'ml', 'µf', 'µh', 'µm', 'μf',
]

# Translation table (code point -> replacement) for German umlauts and sharp s.
spcial_char_map = str.maketrans({'ä': 'a', 'ü': 'u', 'ö': 'o', 'ß': 's'})

In [73]:
# mega-function that cleans up the corpus: tokenizes, lemmatizes, removes verbs, etc.

def krasavchik(jj):
    """Clean raw catalogue or query content into a list of unique tokens.

    ``jj`` may be any object; it is converted with ``str`` first. The text is
    stripped of the structural ``'children':`` / ``'title':`` key names (only
    when ``'title':`` occurs in it), of underscores, ASCII punctuation and
    digits, lower-cased, mapped through ``spcial_char_map``, tokenized and
    lemmatized. Tokens tagged VBD/VBN/VBP/VBZ, repeated tokens and stop words
    are dropped.

    Returns:
        list[str]: the remaining tokens in order of first appearance. An
        empty list is returned when nothing survives the cleaning (previously
        this case raised a ValueError while naming the columns of an empty
        frame).
    """
    text = str(jj)

    # nested catalogue content: drop the key names, keep the values
    if "'title':" in text:
        text = text.replace("'children':", '').replace("'title':", '')
    text = text.replace("_", ' ')

    # remove punctuation and numeric values, then lower-case
    text = text.translate(str.maketrans('', '', string.punctuation + digits)).lower()

    # replace umlauts with their English alternatives
    text = text.translate(spcial_char_map)

    # tokenize and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    if not tokens:
        return []

    verb_tags = {'VBD', 'VBN', 'VBP', 'VBZ'}
    ignored = set(stop_words)

    # Duplicates are resolved among the non-verb tokens first (first occurrence
    # wins) and stop words are removed afterwards, as in the original
    # frame-based pipeline. Frequencies are deliberately not kept.
    seen = set()
    cleaned = []
    for word, tag in pos_tag(tokens):
        if tag in verb_tags or word in seen:
            continue
        seen.add(word)
        if word not in ignored:
            cleaned.append(word)

    return cleaned

In [9]:
# # categories that we have 
# categories = krasavchik(final.names.tolist())

## Manufacturers catalog

In [11]:
# get just manufacturers from the catalog
manufacturer_rows = product_info.title.isin(['Manufacturer', None])
manufacturers = product_info.loc[manufacturer_rows].drop(columns='title')

# get list of manufacturers for each product
import re
def transformy(jj):
    """Extract manufacturer names, in their original spelling, from a catalogue entry.

    ``jj`` is converted with ``str``; the ``'children':`` / ``{'title':`` key
    text and everything between parentheses are removed, and all ASCII
    punctuation except commas is stripped. The remaining text is split on
    commas.

    Returns:
        list[str]: the manufacturer names, or the string
        ``'no info on manufacturers'`` when ``jj`` is missing (``None`` or a
        float NaN, which is how pandas represents a missing cell).
    """
    # NaN is the only float that is not equal to itself
    if jj is None or (isinstance(jj, float) and jj != jj):
        return 'no info on manufacturers'

    # collapse to string and drop the structural key names
    text = str(jj).replace("'children':", '').replace("{'title':", '')

    # remove values between parentheses
    text = re.sub(r'\([^)]*\)', '', text)

    # remove punctuation, keeping the commas that separate the names
    drop_all_but_commas = str.maketrans('', '', string.punctuation.replace(',', ''))
    text = text.translate(drop_all_but_commas)

    # normalise whitespace, then split into the individual names
    return " ".join(text.strip().split()).split(' , ')

# get list of manufacturers 
# Two views of the same 'children' cell: the manufacturer names in their
# original spelling, and the cleaned tokens.
manufacturers = manufacturers.assign(
    manufacturers=manufacturers['children'].apply(transformy),
    cleaned=manufacturers['children'].apply(krasavchik),
)

manufacturers.head()

Unnamed: 0,children,sub_title,ob_class,corpus,manufacturers,cleaned
0,"[{'title': 'Draper (8)'}, {'title': 'Kaiser+Kr...",Street broom,Street broom,"Manufacturer [{'title': 'Draper (8)'}, {'title...","[Draper, KaiserKraft, RS Pro, Sealey, Vikan, W...","[draper, kaiserkraft, pro, sealey, vikan, wurth]"
0,"[{'title': 'Apple (8)'}, {'title': 'Black & De...",Cordless vacuum cleaner,Vacuum cleaner,"Manufacturer [{'title': 'Apple (8)'}, {'title'...","[Apple, Black Decker, Kärcher, Makita, Milwauk...","[apple, black, decker, karcher, makita, milwau..."
70,"[{'title': 'Draper (19)'}, {'title': 'Electros...",Bagless vacuum cleaner,Vacuum cleaner,"Manufacturer [{'title': 'Draper (19)'}, {'titl...","[Draper, Electrostar, Karcher, Kärcher, Sealey...","[draper, electrostar, karcher, sealey, slingsby]"
74,"[{'title': 'Draper (17)'}, {'title': 'Electros...",Floor vacuum cleaner,Vacuum cleaner,"Manufacturer [{'title': 'Draper (17)'}, {'titl...","[Draper, Electrostar, Karcher, Kärcher, Sealey...","[draper, electrostar, karcher, sealey, slingsby]"
78,"[{'title': 'Bosch (1)'}, {'title': 'Coreparts ...",Upright vacuum cleaner,Vacuum cleaner,"Manufacturer [{'title': 'Bosch (1)'}, {'title'...","[Bosch, Coreparts, DEERMA Malaysia, Karcher, K...","[bosch, coreparts, malaysia, karcher, slingsby]"


In [12]:
# list of unique manufacturers in our DB
# Flatten the per-row token lists in a single pass. Summing the lists would
# copy the growing result for every row and would fail on an empty column.
uni_manufacturers = list({
    token
    for tokens in manufacturers['cleaned']
    for token in tokens
})
uni_manufacturers[0:3]

['sip', 'online', 'icidu']

In [13]:
# find synonyms for each word in corpus (except brand names!) to enrich the corpus
#  - we have no discriptions in original set!
def synomizer(x, max_n_synonyms=3, max_depth=4):
    """Enrich the word ``x`` with WordNet synonyms.

    Args:
        x: the word to expand.
        max_n_synonyms: number of lemma names taken from the chosen synset.
        max_depth: synsets at positions 1 .. max_depth-1 are tried when the
            first synset offers a single lemma name only.

    Returns:
        ``x`` unchanged when it is a known manufacturer token or when the
        lookup/cleaning fails for any reason (e.g. no synsets, or fewer
        synsets than the search needs); otherwise the cleaned list built from
        the chosen lemma names plus ``x`` itself.
    """
    # we will not try to find synonyms for brand names
    if x in uni_manufacturers:
        return x

    try:
        # query WordNet once instead of once per inspected synset
        synsets = wn.synsets(x)
        j = synsets[0].lemma_names()

        if len(j) == 1:
            for i in range(1, max_depth):
                j = synsets[i].lemma_names()
                if len(j) > 1:
                    break

        # (disabled) remove words that appear in category names - since we
        # risk blurring the categorization
        # j = [e for e in j if e not in categories]

        return krasavchik(j[0:max_n_synonyms] + [x])

    except Exception:
        # deliberate best effort: keep the word as is. Unlike a bare `except`,
        # this lets KeyboardInterrupt/SystemExit through.
        return x

# quick check: show the synonym expansion of a sample word
print(synomizer('cleaning'))

# Function to convert list of words to string
def listToString(s):
    """Return ``str(s)`` with every ASCII punctuation character removed."""
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return str(s).translate(drop_punctuation)

['cleaning', 'cleansing', 'cleanup']


## Our corpus

In [14]:
# Our corpus

# One row per sub-category: join the bag-of-words strings of all its rows.
final = (
    product_info[['sub_title', 'corpus']]
    .groupby(['sub_title'], as_index=False)
    .agg({'corpus': ' '.join})
    .rename(columns={'sub_title': 'names'})
)

# Replace each joined string by its cleaned token list.
final['corpus'] = final['corpus'].apply(krasavchik)

# alternative kept for reference:
# final=pd.DataFrame({'names': pre_final_corpus.keys(),
#                     'corpus' : list(map(krasavchik,pre_final_corpus.items())) } )

final.head()

Unnamed: 0,names,corpus
0,1-ear hose clamps,"[prevost, hose, clamp, bracket, clamping, range]"
1,10GBase-CX4 network card,"[hp, ibm, intel, qnap, acc, spare, part, non, ..."
2,19 inch document drawer,"[allnet, efbelektronik, ic, intracom, intellin..."
3,19-inch mounting kit,"[apc, cisco, fujitsu, hp, lenovo, vertiv, inch..."
4,19-inch power strip,"[apc, dell, hp, pduex, power, data, tripp, lit..."


In [15]:
# let's enrich our corpus with synonyms, since the items have no descriptions to be used in training
final['corpus_with_syn'] = final['corpus'].apply(
    lambda words: [synomizer(word) for word in words]
)

In [16]:
# add appropriate string for dtm
# Flatten each (nested) synonym list into one plain string for the vectorizer.
final['corpus_me'] = final['corpus_with_syn'].apply(listToString)

# remove limited categories
# NOTE(review): a row is kept when EITHER condition holds. If `names` never
# contains nulls this filter keeps every row - confirm whether `&` was intended.
has_enough_tokens = final['corpus'].map(len) > 4
has_name = ~final['names'].isnull()
final = final[has_enough_tokens | has_name]

In [17]:
# finally we are ready to train the classifier!
final.loc[:, ['names', 'corpus_me']].head()

Unnamed: 0,names,corpus_me
0,1-ear hose clamps,prevost hosiery hose clamp clinch bracket angl...
1,10GBase-CX4 network card,hp ibm intel qnap acc spare part non ei hw gba...
2,19 inch document drawer,allnet efbelektronik ic intracom intellinet st...
3,19-inch mounting kit,apc cisco fujitsu hp lenovo vertiv inch climb ...
4,19-inch power strip,apc dell hp pduex power data tripp lite inch s...


## DTM - my approach: classifier trained on a binary bag of words + fuzzy matching

In [18]:
# DTM - binary term indicators (CountVectorizer with binary=True; no tf-idf weighting is applied), since we don't care about frequencies - there are no descriptions!
from sklearn.feature_extraction.text import CountVectorizer #, TfidfVectorizer
# Binary document-term matrix: 1 when a token occurs in a category's corpus.
vectorizer = CountVectorizer(binary=True)
count_array = vectorizer.fit_transform(list(final['corpus_me'])).toarray()

feature_names = vectorizer.get_feature_names_out()
dtm = pd.DataFrame(count_array, columns=feature_names)

# our classes: label every row with its category name
dtm.index = final.names

dtm

Unnamed: 0_level_0,aa,aaa,aaaa,aaron,ab,abb,abbatron,abc,abdominal,abekp,...,zinc,zincair,zip,zipper,zirconia,zirconium,zmorph,zn,zone,zoom
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-ear hose clamps,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10GBase-CX4 network card,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19 inch document drawer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19-inch mounting kit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19-inch power strip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLR wall socket,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XQD card,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Xbox,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Z rail,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train simple Multinomial Naive Bayes classifier - no testing =(

In [19]:
# our model
from sklearn.naive_bayes import MultinomialNB
# Every row of the DTM is its own class: the category name stored in the index.
mini_model = MultinomialNB()
mini_model.fit(X=dtm, y=dtm.index)

MultinomialNB()

## Final querying function - our mini item-recommendation system

In [59]:
# predicts the product category based on the query
def final_quering(
            query,
            distince_thresh = 4,
            matching_by_first_letter = True,
            certanty_thresh = 0.9,
            show_more = False
            ):
    """Predict the product categories that best match a free-text query.

    The query is cleaned with ``krasavchik`` and turned into a one-row feature
    vector over the model's vocabulary: words found in the vocabulary get the
    weight 1, every other query word contributes ``1 / edit_distance`` to the
    vocabulary words that are closer than ``distince_thresh``.

    Args:
        query: the user's search text.
        distince_thresh: vocabulary words at this edit distance or further
            from a query word get no fuzzy weight.
        matching_by_first_letter: compare an unmatched query word only with
            vocabulary words that start with the same letter (far fewer
            distance computations).
        certanty_thresh: fraction of the top probability; every category whose
            probability exceeds ``top probability * certanty_thresh`` is
            returned. Pass ``None`` to get the single top prediction.
        show_more: print which query words matched exactly and which ones
            went through fuzzy matching.

    Returns:
        ``''`` for an empty query; the top predicted category when
        ``certanty_thresh`` is ``None``; otherwise the list of matching
        categories ordered by decreasing probability, or a message string
        when at least half of all categories would match.
    """
    # if an empty string is passed
    if query=='':
        return ''

    # clean up the query
    needed = krasavchik(query)
    wanted = set(needed)

    # strict match, reported in vocabulary order
    features = pd.Index(mini_model.feature_names_in_)
    we_have_it = [word for word in features if word in wanted]

    # query words that have no exact match
    unmatched = list(wanted - set(we_have_it))

    # weight of every vocabulary word for this query
    scores = pd.Series(0.0, index=features)
    first_letters = features.astype(str).str[0]

    # fuzzy match for the words without an exact match
    for word in unmatched:

        # people rarely mistype the first letter, so the candidates can be
        # restricted to the vocabulary words sharing it
        if matching_by_first_letter:
            candidates = features[first_letters == word[0]]
        else:
            candidates = features

        if len(candidates) == 0:
            continue

        weights = []
        for candidate in candidates:
            dis = edit_distance(candidate, word, transpositions=True)
            if dis >= distince_thresh:
                weights.append(0.0)
            else:
                # inverse of the distance: closer words weigh more. `dis` is
                # never 0 here because `word` is not part of the vocabulary.
                weights.append(1 / dis)

        # Keep the best weight seen so far for every candidate. Plain
        # assignment would let each query word wipe out the fuzzy matches of
        # the words processed before it.
        scores.loc[candidates] = np.maximum(scores.loc[candidates].to_numpy(), weights)

    # plug in the exact matches
    if we_have_it:
        scores.loc[we_have_it] = 1

    # one-row dtm for the user query, with the model's column names
    final_dataframe = pd.DataFrame([scores.to_numpy()], columns=mini_model.feature_names_in_)

    # some info on the matching
    if show_more:
        print('Words with perfect matching: ', we_have_it)
        print('Words with no matching - fuzzy matching applied: ', unmatched)

    # output only the top 1 prediction for the user query
    if certanty_thresh is None:
        return mini_model.predict(final_dataframe)[0]

    # otherwise output all categories that fall into the "credible region"
    final_predictions = pd.DataFrame({'products': mini_model.classes_.reshape(-1),
                                      'proba': mini_model.predict_proba(final_dataframe).reshape(-1)})
    proba_thresh = final_predictions.proba.max() * certanty_thresh

    # final list of matching products, best first
    at_last = (
        final_predictions[final_predictions.proba > proba_thresh]
        .sort_values('proba', ascending=False)
        .products.tolist()
    )

    # if half of the catalogue (or more) qualifies, there is no good match
    if len(at_last) >= round(len(final_predictions.products)/2):
        return '''There are no good matches to your request :(\nTry rephrazing.'''

    return at_last

In [60]:
# test - write your query
query = 'dust mask'

# results: every category within 95% of the top probability
predicted_categories = final_quering(query=query, certanty_thresh=0.95)
print(predicted_categories)

['Coarse dust mask', 'Fine dust mask', 'Signs tested in practice']


In [65]:
# let's show the manufacturers for the recommended products

# final_quering returns a list of categories, but it can also return a plain
# string: a single top-1 category, '' or a "no good matches" message. Normalise
# all of these to a list so the lookup below never fails.
if isinstance(predicted_categories, list):
    ranked_categories = predicted_categories
elif predicted_categories in manufacturers.sub_title.values:
    ranked_categories = [predicted_categories]
else:
    ranked_categories = []

output = (
    manufacturers.loc[manufacturers.sub_title.isin(ranked_categories), ['sub_title', 'manufacturers']]
    .reset_index(drop=True)
    # make sure the order is correct - according to the model's predictions
    .assign(sort_cat=lambda frame: pd.Categorical(frame['sub_title'],
                                                  categories=ranked_categories,
                                                  ordered=True))
    .sort_values('sort_cat')
    .reset_index(drop=True)
    .drop(columns='sort_cat')
)

output

Unnamed: 0,sub_title,manufacturers
0,Coarse dust mask,"[Dräger, Vitrex]"
1,Fine dust mask,"[3M, Draper, FTUK, MoldexMetric, Precision Tec..."
2,Signs tested in practice,"[Brady, Eurokraft, Phoenix, Phoenix Contact, R..."
