In [1]:
import pandas as pd

import nltk
from gensim.models import Word2Vec 
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import requests, re, random

In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

def cleanRawText(text):
    """
    Input : String of Text
    Output: List of Words
    
    """
    #To lower
    text = text.lower()
    #Removing Unicode Characters
    text = text.encode('ascii', 'ignore').decode()
    #Removing Links
    text = re.sub("https*\S+", "", text)
    text = re.sub("www.*\S+", "", text)
    #Removing Ticks and the Next Character
    text = re.sub("\'\w+", '', text)
    #Removing Punctuation except '.'
    text = re.sub("[^\w\s]", '', text)
    #Removing Numbers and Extra Lines
    text = re.sub("[\r\n\d\_]", ' ', text)
    #Removing Extra Spaces
    text = re.sub(" +", ' ', text)
    
    #Removing StopWords
    text = [word for word in text.split(' ') if word not in stop_words]
    return text

In [3]:
df = pd.read_csv('ecommerce.csv')

In [4]:
df.head()

Unnamed: 0,_unit_id,relevance,relevance:variance,product_image,product_link,product_price,product_title,query,rank,source,url,product_description
0,711158459,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$329.98,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,1,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
1,711158460,4.0,0.0,http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSy...,http://www.ebay.com/itm/Sony-PlayStation-4-Lat...,$324.84,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,playstation 4,2,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
2,711158461,4.0,0.0,http://thumbs4.ebaystatic.com/d/l225/m/m10NZXA...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$324.83,Sony PlayStation 4 PS4 500 GB Jet Black Console,playstation 4,3,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
3,711158462,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmA...,http://www.ebay.com/itm/Sony-PlayStation-4-500...,$350.00,Sony - PlayStation 4 500GB The Last of Us Rema...,playstation 4,4,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,
4,711158463,3.33,0.471,http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$308.00\nTrending at\n$319.99,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,5,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...


In [5]:
print(f'Total Products Originally: {len(df)}')
df = df.dropna(subset=['product_title', 'product_description'])
print(f'Total Products After Removing NA Values: {len(df)}')
df = df.drop_duplicates(subset=['product_title'])
print(f'Total Products After Removing Duplicates: {len(df)}')

Total Products Originally: 32671
Total Products After Removing NA Values: 24800
Total Products After Removing Duplicates: 22735


In [6]:
df_text = list(df.product_description.apply(cleanRawText))

In [7]:
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield TaggedDocument(list_of_words, [i])
        
dataDoc = list(tagged_document(df_text))

In [8]:
def similar_products(model, product):
    productDes = cleanRawText(df.product_description.loc[(df.product_title == product)].values[0])
    productVec = model.infer_vector(productDes)
    similar = model.docvecs.most_similar([productVec])
    for tag, similarity in similar:
        print(df.product_title.iloc[tag])
        print(similarity)

In [9]:
def random_product(model):
    randomProduct = df.product_title.sample(1).values[0]
    print(randomProduct)
    print('----------------')
    similar_products(model, randomProduct)

In [53]:
modelDoc1 = Doc2Vec(dm=0, vector_size=300, window_size=10, 
                    min_count=1, negative=5, sampling_threshold=10e-5, worker_count=-1)

modelDoc1.build_vocab(dataDoc)

modelDoc1.train(dataDoc, total_examples=modelDoc1.corpus_count, epochs=20)

print(modelDoc1)

Doc2Vec(dbow,d300,n5,s0.001,t3)


In [59]:
modelDoc2 = Doc2Vec(dm=1, vector_size=300, window_size=15, 
                    min_count=1, negative=5, sampling_threshold=10e-6, worker_count=-1)

modelDoc2.build_vocab(dataDoc)

modelDoc2.train(dataDoc, total_examples=modelDoc2.corpus_count, epochs=30)

print(modelDoc2)

Doc2Vec(dm/m,d300,n5,w5,s0.001,t3)


In [60]:
similar_products(modelDoc1, 'Varicraft AV-2M Avian Mixed Seed Feeder')

Varicraft AV-2M Avian Mixed Seed Feeder
0.957420825958252
Audubon 4 Port Brushed Copper Seed Feeder
0.8198021054267883
Droll Yankees X1 Seed Saver Domed Feeder
0.7937068343162537
Perky-Pet Mixed Seed Forest Green Lantern Wild Bird Feeder MGNCD00352
0.7746235728263855
More Birds Abundance Seed Feeder, Gray
0.7630696296691895
NO/NO Mixed Seed Lantern Wild Bird Feeder
0.7628263831138611
WoodLink Mixed Seed Metal Lantern Bird Feeder - Bird Feeders
0.7583979368209839
Stokes Select Jumbo Seed Tube Bird Feeder-JUMBO SEED TUBE FEEDER
0.752517580986023
Nature's Way Bamboo Hanging Platform Feeder
0.7522631287574768
Perky-Pet Sunflower Seed Red Lantern Wild Bird Feeder CNCD00351
0.748096227645874


In [61]:
similar_products(modelDoc2, 'Varicraft AV-2M Avian Mixed Seed Feeder')

Varicraft AV-2M Avian Mixed Seed Feeder
0.856534481048584
Mondevio Silver Peace Necklace
0.7336527705192566
Houndstooth Comforter Set - Size: Full / Queen, Color: Purple
0.7320859432220459
Screen Protectors for Samsung Galaxy Note 10.1 in. Tablet
0.7105859518051147
4 Film Favorites: Teenage Mutant Ninja Turtles (DVD)
0.7078649997711182
Graco Roomfor2 Classic Connect  Stand & Ride Stroller - Metropolis
0.707024335861206
Barnes High Street Classic Square Nickel Table Clock
0.7057576179504395
Victorinox Swiss Army Huntsman Sapphire Pocket Knife
0.7036020159721375
Thierry Mugler 'Alien' Women's 1-ounce Eau De Parfum Spray
0.6917333006858826
7' Pre-Lit Brinkley Pine Christmas Tree with Colonial Blue Decoration Kit
0.6886978149414062


In [122]:
class Model:
    
    def __init__(self, **kwargs):
        
        self.dm = 0
        self.vector_size = 300
        self.window_size = 15
        self.min_count = 1
        self.negative = 5
        self.sampling_threshold = 10e-5
        self.worker_count = -1
        
        self.__dict__.update(kwargs)

        self.model = Doc2Vec(**self.__dict__)

    def train(self, data, epochs=20):
        self.model.build_vocab(data)
        self.model.train(data, total_examples=self.model.corpus_count, epochs=epochs)
        
    def most_similar(self, tag):
        for tag,similarity in self.model.docvecs.most_similar(tag):
            print(df.product_title.iloc[tag])
            print(similarity)


In [118]:
model_1 = Model(vector_size=301)

In [119]:
model_1.train(dataDoc)

In [120]:
similar_products(model_1.model, 'Varicraft AV-2M Avian Mixed Seed Feeder')

Varicraft AV-2M Avian Mixed Seed Feeder
0.9408191442489624
Audubon 4 Port Brushed Copper Seed Feeder
0.8678876161575317
Droll Yankees X1 Seed Saver Domed Feeder
0.866484522819519
Perky-Pet Mixed Seed Forest Green Lantern Wild Bird Feeder MGNCD00352
0.8371147513389587
Woodstream Seed Barn Bird Feeder
0.8310158848762512
Crescent Moon Bird Feeder
0.8175684213638306
NO/NO Mixed Seed Lantern Wild Bird Feeder
0.8131281733512878
Nature's Way Bamboo Hanging Platform Feeder
0.8123725652694702
2-port Mixed Seed Bird Feeder
0.8088411688804626
More Birds Abundance Seed Feeder, Gray
0.8087459206581116


In [121]:
model_1.most_similar(2)

Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console *NEW*
0.9985468983650208
Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console
0.9982041716575623
Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console
0.9980528354644775
Sony PlayStation 4 (Latest Model) 500 GB Jet Black Console
0.9976085424423218
Sony PlayStation 4 500GB, Dualshock Wireless Control, HDMI Gaming Console Refurb
0.9975502490997314
Sony PlayStation 4 500GB Console with 2 Controllers
0.9973635673522949
PS4 Console Solution Bundle and Bonus Controller
0.9275268316268921
Playstation 4 Console (PS4), Refurbished
0.925129771232605
PS4 Console and Last of Us Game Bundle plus Choice of 2 Games
0.924584150314331
Sony PlayStation 4 - game console - 500 GB HDD
0.9217274188995361
