In [10]:
import pandas as pd
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
import numpy as np 
import random 
from tqdm import tqdm


In [4]:
# load data 
df = pd.read_csv('OnlineRetail.csv',encoding='unicode_escape')

In [6]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [7]:
# remove missing values
df.dropna(inplace=True)

In [8]:
df['StockCode']= df['StockCode'].astype(str)
customers = df["CustomerID"].unique().tolist()
len(customers)

4372

In [9]:
# shuffle customer ID's
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]

In [11]:
# list to capture purchase history of the customers
purchases_train = []

# populate the list with the product codes
for i in tqdm(customers_train):
    temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_train.append(temp)

100%|██████████| 3935/3935 [00:00<00:00, 5262.57it/s]


In [39]:
# list to capture purchase history of the customers
purchases_val = []

# populate the list with the product codes
for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(temp)

100%|██████████| 437/437 [00:00<00:00, 4974.73it/s]


In [18]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3603333, 3642460)

In [21]:
# extract all vectors
X = model.wv[model.wv.index_to_key]

X.shape

(3161, 100)

In [40]:
products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products.drop_duplicates(inplace=True, subset='StockCode', keep="last")


In [46]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.wv.most_similar(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms       

In [52]:
similar_products(model.wv['90019A'])

[('SILVER M.O.P ORBIT DROP EARRINGS', 0.7437092661857605),
 ('GREEN HEART OF GLASS BRACELET', 0.7281607985496521),
 ('PINK BOUDICCA LARGE BRACELET', 0.7258206009864807),
 ('BLUE MURANO TWIST BRACELET', 0.7206528186798096),
 ('ANT COPPER RED BOUDICCA BRACELET', 0.7111385464668274),
 ('SILVER LARIAT 40CM', 0.7006263136863708)]