In [70]:
import pandas as pd

import nltk
from gensim.models import Word2Vec 
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import requests, re, random

In [2]:
from nltk.corpus import stopwords
# English stop words used by cleanRawText. Stored as a set so each
# per-token membership test is O(1) instead of a linear scan of the list.
stop_words = set(stopwords.words("english"))

def cleanRawText(text):
    """
    Normalise a raw text string into a list of lower-case word tokens.

    Input : String of Text
    Output: List of Words (stop words and empty tokens removed)

    Steps, in order: lower-case, drop non-ASCII characters, strip links,
    strip an apostrophe together with the word characters that follow it,
    remove all punctuation (including '.'), turn digits / underscores /
    line breaks into spaces, split on whitespace, and drop every token
    found in the module-level `stop_words` collection.
    """
    # To lower
    text = text.lower()
    # Removing non-ASCII characters
    text = text.encode('ascii', 'ignore').decode()
    # Removing links. The "www" pattern previously was "www.*\S+", whose
    # greedy ".*" deleted everything from "www" to the end of the line;
    # it now stops at the first whitespace after the address.
    text = re.sub(r"https?\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)
    # Removing an apostrophe and the word characters right after it ('s, 't, ...)
    text = re.sub(r"'\w+", '', text)
    # Removing all punctuation (anything that is not a word char or whitespace)
    text = re.sub(r"[^\w\s]", '', text)
    # Replacing digits, underscores and line breaks with a space
    text = re.sub(r"[\r\n\d_]", ' ', text)

    # split() with no argument collapses runs of any whitespace and never
    # yields empty strings, so no separate "extra spaces" pass is needed and
    # leading/trailing blanks no longer produce '' tokens.
    return [word for word in text.split() if word not in stop_words]

In [5]:
# Load the raw product listing table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='ecommerce.csv')

In [6]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,_unit_id,relevance,relevance:variance,product_image,product_link,product_price,product_title,query,rank,source,url,product_description
0,711158459,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$329.98,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,1,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
1,711158460,4.0,0.0,http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSy...,http://www.ebay.com/itm/Sony-PlayStation-4-Lat...,$324.84,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,playstation 4,2,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
2,711158461,4.0,0.0,http://thumbs4.ebaystatic.com/d/l225/m/m10NZXA...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$324.83,Sony PlayStation 4 PS4 500 GB Jet Black Console,playstation 4,3,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
3,711158462,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmA...,http://www.ebay.com/itm/Sony-PlayStation-4-500...,$350.00,Sony - PlayStation 4 500GB The Last of Us Rema...,playstation 4,4,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,
4,711158463,3.33,0.471,http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$308.00\nTrending at\n$319.99,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,5,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...


In [7]:
# Keep only rows that have both a title and a description; rows missing
# either one cannot be cleaned or looked up later.
df = df.dropna(
    axis=0,
    how='any',
    subset=['product_title', 'product_description'],
)

In [9]:
# One token list per product description, in the same row order as df.
df_text = [cleanRawText(description) for description in df.product_description]

In [23]:
def tagged_document(list_of_list_of_words):
    """
    Lazily wrap each word list in a TaggedDocument.

    Input : Iterable of word lists
    Output: Generator of TaggedDocument(words, [position]), where position
            is the zero-based place of the word list in the input.
    """
    position = 0
    for words in list_of_list_of_words:
        yield TaggedDocument(words, [position])
        position += 1
        
# Materialise the tagged corpus so it can be iterated more than once
# (vocabulary building and every training pass read it again).
dataDoc = [document for document in tagged_document(df_text)]

In [24]:
# Doc2Vec with 40-dimensional vectors; min_count=2 and epochs=30 are passed
# to the constructor as model-level settings.
# NOTE(review): the constructor says epochs=30 but train() below is given
# epochs=10 explicitly -- confirm which value is intended.
modelDoc = Doc2Vec(vector_size=40, min_count=2, epochs=30)

# Scan the tagged corpus once to build the vocabulary before training.
modelDoc.build_vocab(dataDoc)

# total_examples must come from this model. The original read
# `model.corpus_count`, but no `model` is defined in this notebook, so the
# cell raised NameError on a fresh kernel.
modelDoc.train(dataDoc, total_examples=modelDoc.corpus_count, epochs=10)

print(modelDoc)

Doc2Vec(dm/m,d40,n5,w5,mc2,s0.001,t3)


In [26]:
# Peek at the first 20 entries of the learned vocabulary.
[token for token in modelDoc.wv.vocab][:20]

['playstation',
 'system',
 'opens',
 'door',
 'incredible',
 'journey',
 'immersive',
 'new',
 'gaming',
 'worlds',
 'deeply',
 'connected',
 'community',
 'step',
 'living',
 'breathing',
 'hero',
 'epic',
 'explore',
 'gritty']

In [33]:
# Sanity check on the word vectors: nearest neighbours of "gaming".
modelDoc.wv.most_similar(positive='gaming', topn=10)

[('gamers', 0.8947846293449402),
 ('gameplay', 0.8494644165039062),
 ('broadcast', 0.8432755470275879),
 ('immersive', 0.8375876545906067),
 ('gamer', 0.8348780274391174),
 ('interactive', 0.8230664730072021),
 ('listening', 0.8209339380264282),
 ('amplified', 0.8180497884750366),
 ('entertainment', 0.8026647567749023),
 ('communication', 0.7968127727508545)]

In [106]:
def similar_products(product):
    """
    Print the products whose descriptions are closest to `product`'s.

    Input : Product title (must match a value in df.product_title exactly)
    Output: None; prints a title line followed by a similarity line for
            each neighbour returned by the document-vector search.

    When several rows share the title, the first matching description is
    used. Raises IndexError if no row has that title.
    """
    title_matches = df.product_title == product
    first_description = df.product_description.loc[title_matches].values[0]
    inferred = modelDoc.infer_vector(cleanRawText(first_description))
    # Tags returned here are looked up positionally, so they are expected to
    # be row positions in df (as assigned when the corpus was tagged).
    for row_position, score in modelDoc.docvecs.most_similar([inferred]):
        print(df.product_title.iloc[row_position])
        print(score)

In [107]:
def random_product():
    """
    Pick one product title at random and print its nearest neighbours.

    Output: None; prints the sampled title, a separator line, and then
            whatever similar_products prints for that title.
    """
    chosen_title = df.product_title.sample(n=1).values[0]
    print(chosen_title, '----------------', sep='\n')
    similar_products(chosen_title)

In [108]:
# Spot-check: sample one product and print its nearest neighbours.
random_product()

Seiko Clocks Melodies in Motion Victoria Wall clock #QXM498GRH
----------------
Seiko Clocks Melodies in Motion Victoria Wall clock #QXM498GRH
0.9362793564796448
Seiko Clocks Melodies in Motion Marquis wall clock #QXM496BRH
0.8569459915161133
Antique Replica Just On Time London England Large Wood Table Clock
0.7913411259651184
Bohemian 5 Light Candle Chandelier Finish: Polished Brass, Crystal Type: Swarovski Spectra
0.764927864074707
Bohemian 10 Light Candle Chandelier Finish: Chrome, Crystal Type: Majestic Wood Polished
0.75298672914505
Bohemian 8 Light Candle Chandelier Finish: Polished Brass, Crystal Type: Swarovski Spectra
0.7524799704551697
Manor court Wall Clock by Bulova - 11.25 Inches Wide
0.7293360829353333
Woodland Imports Artistically Timeless Metal Table Clock
0.7147550582885742
Metal/ Wood Mesh Pattern Wall Clock
0.6795026659965515
Fossil Women's Jaqueline Watch in Polished Rose Goldtone with Sand Leather Strap Women's
0.6725337505340576


In [109]:
# Spot-check again; each call samples a product independently.
random_product()

Cotton Tale Nightingale Decorative Pillow Pack
----------------
Cotton Tale Nightingale Decorative Pillow Pack
0.9125806093215942
Cotton Tale Penny Lane Pillow Pack
0.79416424036026
Cotton Tale Sundance Pillow Pack
0.7909402251243591
Layla Chenille Bedspread (Shams Sold Separately)
0.7799346446990967
Zuma 18x18 Throw Pillows (Set of 2)
0.7785131335258484
PB Paws Pet Collection Puppy Paws Gold Tapestry Decorative Pillows (Set of 2)
0.7769656181335449
Layla King Bedspread by LaMont
0.7683161497116089
Ralph Lauren Girl's Floral Cotton Sateen Dress - Yellow - Size 14
0.7530744075775146
Madison Park Sheridan 7-Piece Comforter Set
0.7436462640762329
16 Schwinn Cosmo Boys' Bike"
0.7389103174209595


In [110]:
# Another independent spot-check on a freshly sampled product.
random_product()

Clarins Extra-Firming Eye Wrinkle 0.5-ounce Smoothing Cream
----------------
RoC Retinol Correxion Eye Cream
0.94133460521698
Boots No7 Lift & Luminate Eye Cream, .5 fl oz
0.9325252771377563
Lifting/Firming Solutions Kit by Estee Lauder for Unisex - 2 Pc Kit 1.7oz Perfectionist (CP+R) Wrinkle Lifting Firming Serum - All Skin Types, 1.7oz Resilience Lift Firming/Sculpting Face and Neck Cream SPF 15 - Normal/Combination Skin
0.9190438985824585
Cellular Laboratories De-Aging Eye Creme
0.9168297052383423
RoC Retinol Correxion Eye Cream
0.9123532772064209
Clinique All About Eye Rich 0.5-ounce Cream
0.9102678298950195
Muk Luks Alaska Short Snow Boot
0.9096645712852478
Harley-Davidson Safety Eyewear 883-HD510 Metal Frame Blue Mirrorlens
0.9074656963348389
Neutrogena Rapid Wrinkle Repair Eye Cream
0.9047814607620239
EDGE DiskGO Portable - hard drive - 500 GB - USB 2.0
0.9032014608383179


In [111]:
# Final spot-check on one more randomly sampled product.
random_product()

Voodoo Tactical Mens Coyote Tan Horizontal Shotgun Shell Tactical Pouch Bag NEW
----------------
Voodoo Tactical Mens Coyote Tan Horizontal Shotgun Shell Tactical Pouch Bag NEW
0.9118974208831787
Overland 22-inch Oversized Duffel Bag
0.7414513826370239
NcSTAR CV12SHCU Vism By Ncstar Tactical Shotshell Carrier-Urban Gray
0.7383074164390564
Voodoo Tactical MOLLE Compatible Shotgun Shell Ammo Pouch - A-TACS
0.7188911437988281
Black MOLLE Shotgun Shell Ammo Pouch from Charlie Delta Tactical
0.6985151767730713
Victorinox Centurion Multipurpose Tool
0.6958184242248535
Victorinox Swiss Army Outrider Multipurpose Tool
0.6896765232086182
EdgeWork Pick Your Poison Full Tang Tactical Belt Knife
0.6771774291992188
Alpine Swiss Mens Trifold Wallet Genuine Leather Card Case ID Window Billfold NW Brown One Size
0.6765384674072266
Stansport Traveler Duffle Bag, 18 x 36""
0.6753666400909424


In [112]:
# Persist the trained model to "d2v.model" in the working directory.
modelDoc.save("d2v.model")