# Product Similarity using Text Description & Gensim 

Import base libraries:

In [1]:
import pandas as pd
import numpy as np
import scipy as sp

Load in product dataset, and concatenate all available product text into a column called "all_product_text":

(the dataset was downloaded from https://www.kaggle.com/c/retail-products-classification/)

In [2]:
product_data = pd.read_csv( 'C://Users//jbolton//Downloads//product_text_data//train.csv' ) 
print( f'product data has {product_data.shape[0]} rows and {product_data.shape[1]} columns')

# replace null field values with empty string ''0
product_data['title'] = product_data['title'].fillna('')
product_data['description'] = product_data['description'].fillna('')
product_data['categories'] = product_data['categories'].fillna('')

# make a single field containing all of the product text:
product_data['all_product_text'] = product_data['title'] + ' ' + product_data['description'] + ' ' + product_data['categories']
#product_data = product_data.dropna( subset=['all_product_text'] ) # remove rows with NA product text
product_data.head()

product data has 46229 rows and 4 columns


Unnamed: 0,ImgId,title,description,categories,all_product_text
0,B000HYL1V6,TUNGSTEN SOLDER PICK WITH HANDLE,Solder Pick for picking up molten solder when ...,"Arts, Crafts & Sewing",TUNGSTEN SOLDER PICK WITH HANDLE Solder Pick f...
1,B00006HXWY,Write Right 98167 Screen Protector for Sony T615C,We all screen. And we all need to protect thos...,Cell Phones & Accessories,Write Right 98167 Screen Protector for Sony T6...
2,B000GAWSBS,Casio Mens DBC310-1 Databank 300 Digital Watch...,"Bringing you precision at a glance, the Casio ...","Clothing, Shoes & Jewelry",Casio Mens DBC310-1 Databank 300 Digital Watch...
3,B000040JOL,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Tools & Home Improvement,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...
4,B00006IB78,Energizer 2 in 1 Light,This twoway light features a bright flashlight...,Health & Personal Care,Energizer 2 in 1 Light This twoway light featu...


Have a look at the product text for the first 5 products:

In [3]:
documents_list = product_data['all_product_text'].values

for i in range(5):
    print( f'{documents_list[i]}\n\n' )

TUNGSTEN SOLDER PICK WITH HANDLE Solder Pick for picking up molten solder when making jewelry Arts, Crafts & Sewing


Write Right 98167 Screen Protector for Sony T615C We all screen. And we all need to protect those screens on our expensive little gadgetry. Our each is a pack of (12) clear screen protectors, each a generous 2-15/16" x 2-1/4" x .004" thick. They protect from scratches and reduce glare. Made for the Sony Cli, but easy to cut to fit anything with a screen you want to protect. Made in the USA for Fellowes. Cell Phones & Accessories


Casio Mens DBC310-1 Databank 300 Digital Watch (Discontinued by Manufacturer) Bringing you precision at a glance, the Casio Men's Databank Digital Watch #DBC310-1 features a blue-tone digital dial face with a durable mineral dial window. An auto-calendar displays the date and month. Other details that will keep you in control include a 150-page databank, an 8-digit calculator, a daily alarm, and a stopwatch function. To ensure easy wear, the l

Make all text lower-case, and remove punctuation and numbers:

In [4]:
import string
doclist_no_punc = [ myDoc.lower().translate( str.maketrans('', '', string.punctuation + string.digits) ) for myDoc in documents_list ]

# here is how the cleaned text looks for the first 5 products:
for i in range(5):
    print( f'{doclist_no_punc[i]}\n\n' )  

tungsten solder pick with handle solder pick for picking up molten solder when making jewelry arts crafts  sewing


write right  screen protector for sony tc we all screen and we all need to protect those screens on our expensive little gadgetry our each is a pack of  clear screen protectors each a generous  x  x  thick they protect from scratches and reduce glare made for the sony cli but easy to cut to fit anything with a screen you want to protect made in the usa for fellowes cell phones  accessories


casio mens dbc databank  digital watch discontinued by manufacturer bringing you precision at a glance the casio mens databank digital watch dbc features a bluetone digital dial face with a durable mineral dial window an autocalendar displays the date and month other details that will keep you in control include a page databank an digit calculator a daily alarm and a stopwatch function to ensure easy wear the light gray resin band is accompanied by a sturdy buckle clasp and both the m

Remove common words and tokenize (i.e. turn documents into bag/list of words):
(for future: note that "nltk" package has a nice list of stop words)

In [5]:
stoplist = set('for a of the and to in'.split() )

texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in doclist_no_punc
]

# here is the tokenized text for the first 5 products:
for i in range(5):
    print( f'{texts[i]}\n\n' ) 

['tungsten', 'solder', 'pick', 'with', 'handle', 'solder', 'pick', 'picking', 'up', 'molten', 'solder', 'when', 'making', 'jewelry', 'arts', 'crafts', 'sewing']


['write', 'right', 'screen', 'protector', 'sony', 'tc', 'we', 'all', 'screen', 'we', 'all', 'need', 'protect', 'those', 'screens', 'on', 'our', 'expensive', 'little', 'gadgetry', 'our', 'each', 'is', 'pack', 'clear', 'screen', 'protectors', 'each', 'generous', 'x', 'x', 'thick', 'they', 'protect', 'from', 'scratches', 'reduce', 'glare', 'made', 'sony', 'cli', 'but', 'easy', 'cut', 'fit', 'anything', 'with', 'screen', 'you', 'want', 'protect', 'made', 'usa', 'fellowes', 'cell', 'phones', 'accessories']


['casio', 'mens', 'dbc', 'databank', 'digital', 'watch', 'discontinued', 'by', 'manufacturer', 'bringing', 'you', 'precision', 'at', 'glance', 'casio', 'mens', 'databank', 'digital', 'watch', 'dbc', 'features', 'bluetone', 'digital', 'dial', 'face', 'with', 'durable', 'mineral', 'dial', 'window', 'an', 'autocalendar', 'display

Remove words appearing few times, and represent all documents as a list of lists (i.e. list of documents where each document is a list of words):

In [6]:
from collections import defaultdict    # a python dictionary with some desirable properties that aren't in the standard python dict()

# remove words that appear less than [min_word_freq] times:
min_word_freq = 1
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# print out the first 5 documents:
for i in range(5):
    print( f'{texts[i]}\n\n' ) 

['tungsten', 'solder', 'pick', 'with', 'handle', 'solder', 'pick', 'picking', 'up', 'molten', 'solder', 'when', 'making', 'jewelry', 'arts', 'crafts', 'sewing']


['write', 'right', 'screen', 'protector', 'sony', 'tc', 'we', 'all', 'screen', 'we', 'all', 'need', 'protect', 'those', 'screens', 'on', 'our', 'expensive', 'little', 'gadgetry', 'our', 'each', 'is', 'pack', 'clear', 'screen', 'protectors', 'each', 'generous', 'x', 'x', 'thick', 'they', 'protect', 'from', 'scratches', 'reduce', 'glare', 'made', 'sony', 'cli', 'but', 'easy', 'cut', 'fit', 'anything', 'with', 'screen', 'you', 'want', 'protect', 'made', 'usa', 'fellowes', 'cell', 'phones', 'accessories']


['casio', 'mens', 'dbc', 'databank', 'digital', 'watch', 'discontinued', 'by', 'manufacturer', 'bringing', 'you', 'precision', 'at', 'glance', 'casio', 'mens', 'databank', 'digital', 'watch', 'dbc', 'features', 'digital', 'dial', 'face', 'with', 'durable', 'mineral', 'dial', 'window', 'an', 'autocalendar', 'displays', 'date', 

Define the dictionary and corpus in the format that GENSIM uses:

In [7]:
# define the dictionary and corpus
from gensim import corpora

dictionary = corpora.Dictionary(texts)                    # create dictionary of all unique words in [texts] object
corpus = [ dictionary.doc2bow(text) for text in texts ]   # create a corpus of all documents (a list containing a bag-of-words representation of each document)  

# have a look at the structure of the dictionary and the corpus:
print( 'how the words in the dictionary are indexed by gensim (first 5 words):' )
print( list( dictionary.token2id.items() )[:5] )

how the words in the dictionary are indexed by gensim (first 5 words):
[('arts', 0), ('crafts', 1), ('handle', 2), ('jewelry', 3), ('making', 4)]


In [8]:
# documents (products in this case) are stored in the corpus like this: 
#corpus[99]   # this is product in position 99 in the corpus

Import some GENSIM functions and models:

In [9]:
from gensim import similarities
from gensim import models
import gensim.downloader as gensim_api         # for accessing corpora and pre-trained models. see https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html.    
from gensim.similarities import WmdSimilarity  # for word movers distance

Here is a list of the pretrained word embeddings available in GENSIM:

In [10]:
gensim_info = gensim_api.info()

# list of pre-trained models available in gensim:
for model_name, model_data in sorted(gensim_info['models'].items()):
    print(
        '%s (%d records): %s' % (
            model_name,
            model_data.get('num_records', -1),
            model_data['description'][:40] + '...',
        )
    )

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

Load the pretrained 100-dimensional GloVE word embeddings trained on Wikipedia data: 

In [11]:
glove_wiki100_model = gensim_api.load("glove-wiki-gigaword-100")

I tested out Word Mover's Distance, but the algorithm is too slow to be useful for a dataset of this size:

In [12]:
# code for "word movers distance" using the {glove-wiki-gigaword-100} model:
#glove_wiki100_model.wmdistance('this is the first sentence', 'this be phrase number one')
#similarity_instance_wmd_glove_wiki100 = WmdSimilarity( texts, glove_wiki100_model, num_best=5 )
#get_top_matches = similarity_instance_wmd_glove_wiki100[ texts[99] ]

#print( 'query (to find matches for):\n' + ' '.join(texts[99]) + '\n' )
#for i in range(len(get_top_matches)):
#    print( f'-- match {i}' )
#    print( ' '.join( texts[i] ) )
#    print( '\n' )

Algorithm 1: Treat a document as an average over it's containing words (using the glove wiki-100 word embeddings):

In [13]:
def glove_wiki100_mean_word_representation( bagOfWords ):
    # this function calculates the average over all word vectors in the provided bag of words (using pretrained Glove Wiki-100 word embeddings)
    # my modification of the function from this source: http://yaronvazana.com/2018/09/20/average-word-vectors-generate-document-paragraph-sentence-embeddings/
    words_in_model_vocab = []
    for i in range(len(bagOfWords)):
        try:
            test = glove_wiki100_model[ bagOfWords[i] ]
            words_in_model_vocab += [ bagOfWords[i] ]
        except:
            pass
    if len(words_in_model_vocab) >= 1:
        return np.mean(glove_wiki100_model[words_in_model_vocab], axis=0) 
    else:
        return []
        
# here is an example (for item 19) :       
print( 'EXAMPLE OF FUNCTION USE:')
print( texts[19] )
glove_wiki100_mean_word_representation( texts[19] )

# get glove-wiki100 mean word vector for every product:
glove_wiki100_mean_word_vectors = [ glove_wiki100_mean_word_representation(word_i) for word_i in texts ]

EXAMPLE OF FUNCTION USE:
['top', 'brass', 'creme', 'formula', 'men', 'oz', 'g', 'nongreasy', 'control', 'mans', 'hair', 'makes', 'it', 'behave', 'right', 'style', 'easy', 'look', 'perfectly', 'groomed', 'this', 'nongreasy', 'cream', 'controls', 'mans', 'hair', 'so', 'it', 'behaves', 'way', 'it', 'should', 'imparts', 'healthy', 'shine', 'gives', 'extra', 'body', 'so', 'hair', 'is', 'easy', 'style', 'beauty']


Algorithm 2: Fit TF-IDF model:

In [14]:
tfidf_model = models.TfidfModel( corpus, smartirs='ntc' )

# Show the TF-IDF weights
#for doc in tfidf_model[corpus][0:5]:
#    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])
# [['first', 0.66], ['is', 0.24], ['line', 0.66], ['the', 0.24]]
# [['is', 0.24], ['the', 0.24], ['second', 0.66], ['sentence', 0.66]]
# [['document', 0.71], ['third', 0.71]]

# transform corpus to TF-IDF space and index it:
tfidf_index = similarities.MatrixSimilarity( tfidf_model[corpus] )

Algorithm 3: Fit Latent Semantic Indexing (LSI) model with 1000 topics (dimensions):

(note that the number of dimensions was chosen arbitrarily) 

In [15]:
# create Latent Semantic Indexing (LSI) model
lsi_dim = 1_000    # number of dimensions of LSI representation (number of unique topics)
lsi_model = models.LsiModel( corpus, id2word=dictionary, num_topics=lsi_dim )   

# transform corpus to LSI space and index it:
lsi_index = similarities.MatrixSimilarity( lsi_model[corpus] )      # for datasets which don't fit into memory, use similarities.Similarity class instead

Algorithm 4: Fit Latent Dirichlet Allocation (LDA) model with 1000 topics (dimensions):

(note that number of dimensions was chosen arbitrarily)

In [16]:
lda_dim = 1_000
lda_model = models.LdaModel( corpus, num_topics=lda_dim )

# transform corpus to LDA space and index it:
lda_index = similarities.MatrixSimilarity( lda_model[corpus] )      # for datasets which don't fit into memory, use similarities.Similarity class instead

Show some recommendation examples using the 4 algorithms specified above:

In [18]:
n_examples = 5        
top_n_products = 5     # number of product recommendations to fetch 
import random

for example_i in range(1,n_examples+1):
    
    # choose a random product:
    random_product_index = random.choice( range(len(documents_list)) )
    random_product = texts[ random_product_index ]

    # generate gensim bag-of-words representation of this random product:
    random_prodvec_bow = dictionary.doc2bow( texts[random_product_index] )    
    
    # get vector representation (embedding) of this random product under each model: 
    prodvec_tfidf = tfidf_model[ random_prodvec_bow ]  # convert the query to TF-IDF space
    prodvec_lsi = lsi_model[ random_prodvec_bow ]      # convert the query to LSI space
    prodvec_lda = lda_model[ random_prodvec_bow ]      # convert the query to TF-IDF space
    prodvec_mean_glove_wiki100 = glove_wiki100_mean_word_vectors[ random_product_index ] # Glove wiki-100 mean word vector representation for this random product
    
    # perform similarity queries:
    lda_sims = lda_index[ prodvec_lda ] 
    tfidf_sims = tfidf_index[ prodvec_tfidf ]  
    lsi_sims = lsi_index[ prodvec_lsi ]  
    dist_random_product_to_allprod_mean_glove_wiki100 = [ sp.spatial.distance.cosine(prodvec_mean_glove_wiki100, prod_i) for prod_i in glove_wiki100_mean_word_vectors ]
    closest_n_product_indices = np.array( dist_random_product_to_allprod_mean_glove_wiki100 ).argsort()[1:(top_n_products+1)]
    
    # print all of the results:
    print(
    f"""
    -----------------
    --- example {example_i} ---
    -----------------
    PRODUCT TO FIND MATCHES FOR (original product text): \n\t\t\t{documents_list[random_product_index]}
    ------------------------------------------------------------------------------------------------------------------------------------
    PRODUCT TO FIND MATCHES FOR (words seen by model): \n\t\t\t{' '.join( texts[random_product_index] )}
    ------------------------------------------------------------------------------------------------------------------------------------
    """
    )
    
    print( f'-- Closest products using LDA Model with {lda_dim} Dimensions (topics) --' )
    lda_sims = sorted(enumerate(lda_sims), key=lambda item: -item[1])
    for doc_position, doc_score in lda_sims[1:(top_n_products+1)]:
        print( 
            f"""
                MATCHING DOCUMENT: {documents_list[doc_position]}
                DISTANCE: {doc_score}
            """
        )

    print( f'\n-- Closest products using LSI Model with {lsi_dim} Dimensions (topics) --' )
    lsi_sims = sorted(enumerate(lsi_sims), key=lambda item: -item[1])
    for doc_position, doc_score in lsi_sims[1:(top_n_products+1)]:
        print( 
            f"""
                MATCHING DOCUMENT: {documents_list[doc_position]}
                DISTANCE: {doc_score}
            """
        )

    print( f'\n-- Closest products using TF-IDF Model -- ' )
    tfidf_sims = sorted(enumerate(tfidf_sims), key=lambda item: -item[1])
    for doc_position, doc_score in tfidf_sims[1:(top_n_products+1)]:
        print( 
            f"""
                MATCHING DOCUMENT: {documents_list[doc_position]}
                DISTANCE: {doc_score}
            """
        )   

    print( f'\n-- Closest products using average word vector (using pretrained Glove Wiki-100 word embeddings) -- ' )  
    for i in range(top_n_products):
        print( 
            f"""
                MATCHING DOCUMENT: { documents_list[closest_n_product_indices[i]] }
                DISTANCE: {dist_random_product_to_allprod_mean_glove_wiki100[i]}
            """
        )         
    
    print( '------------------------------------------------------------------------------------------------------------------------------------' )
    


    -----------------
    --- example 1 ---
    -----------------
    PRODUCT TO FIND MATCHES FOR (original product text): 
			HTC MyTouch 4G Slide Phone Standard Red LED Wall / AC / Home Charger! HTC MyTouch 4G Slide Phone Standard Red LED Wall / AC / Home Charger! Cell Phones & Accessories
    ------------------------------------------------------------------------------------------------------------------------------------
    PRODUCT TO FIND MATCHES FOR (words seen by model): 
			htc mytouch g slide phone standard red led wall ac home charger htc mytouch g slide phone standard red led wall ac home charger cell phones accessories
    ------------------------------------------------------------------------------------------------------------------------------------
    
-- Closest products using LDA Model with 1000 Dimensions (topics) --

                MATCHING DOCUMENT: HTC EVO 4G Phone Standard Red LED Wall / AC / Home Charger! HTC EVO 4G Phone Standard Red LED Wall / AC / Home 