In [13]:
# I totally used this: https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/

import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

df = pd.read_csv('data/Online Retail.csv')
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


In [14]:
# check for missing values
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [19]:
# remove missing values
df.dropna(inplace=True)

In [20]:
# stockcode to string
df.StockCode = df.StockCode.astype(str)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    406829 non-null  object 
 1   StockCode    406829 non-null  object 
 2   Description  406829 non-null  object 
 3   Quantity     406829 non-null  int64  
 4   InvoiceDate  406829 non-null  object 
 5   UnitPrice    406829 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      406829 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 27.9+ MB


In [22]:
# number of unique customers in our dataset:
customers = df["CustomerID"].unique().tolist()
len(customers)

4372

### train-test split

In [25]:
# shuffle then take 90 percent
random.shuffle(customers)

customers_train = [customers[i] for i in range(round(0.9* len(customers)))]

train_df = df[df['CustomerID'].isin(customers_train)]
test_df = df[~df['CustomerID'].isin(customers_train)]

In [27]:
# create purchase sequences for both test/ train sets

purchases_train = []

# tqdm - is a smart progress bar - wrap around any itterable
for i in tqdm(customers_train):
    temp = train_df[train_df['CustomerID']==i]['StockCode'].tolist()
    purchases_train.append(temp)                                          

100%|██████████| 3935/3935 [00:02<00:00, 1522.06it/s]


In [29]:
# do same for test

purchases_test = []

for i in tqdm(test_df['CustomerID'].unique()):
    temp = test_df[test_df['CustomerID']==i]['StockCode'].tolist()
    purchases_test.append(temp)

100%|██████████| 437/437 [00:00<00:00, 1865.11it/s]


In [44]:
purchases_test[3]

['20754',
 '22740',
 '22825',
 '22680',
 '22797',
 '21843',
 '22080',
 '22680',
 '20677',
 '20675',
 '20752',
 '21243',
 '21244',
 '22080',
 '47590A',
 '47590B',
 '47590B',
 '47590A',
 '21244',
 '21243',
 '20675',
 '20677',
 '22427',
 '20750',
 '23142',
 '23165']

### Build word2Vec

In [30]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3620806, 3658090)

Since we are not planning to train the model any further, we are calling init_sims( ) here. This will make the model much more memory-efficient:

In [31]:
model.init_sims(replace=True)  # can't do any training after replace (only keep normalized ones in memory 
# can use most similar etc but NOT train on it

In [32]:
print(model)


Word2Vec(vocab=3175, vector_size=100, alpha=0.03)


In [55]:
# extract word vectore
# X = model[model.wv.vocab]
# X = model[model.wv.key_to_index.keys()]
# X.shape

In [46]:
# 100 dimensions

# compress to 2 using UMAP
import umap
cluster_embedding = umap.UMAP(n_neighbours=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)
plt.figure(figsize(10,9))
plt.scatter(cluster_embedding[:,0], cluster_embedding[:,1], s=3, cmap='Spectral')

### Recomendation

In [47]:
# create a product-ID and product-description dictionary to easily map a product’s description to its ID and vice versa.

products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()


In [48]:
# test the dictionary
products_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [66]:
sim_words = model.wv.most_similar('84029E')
sim_words

for i in range(5):
    print(sim_words[i], products_dict[sim_words[i][0]])

('84029G', 0.8200174570083618) ['KNITTED UNION FLAG HOT WATER BOTTLE']
('21485', 0.7281389236450195) ['RETROSPOT HEART HOT WATER BOTTLE']
('21479', 0.7165849208831787) ['WHITE SKULL HOT WATER BOTTLE ']
('22112', 0.7139551639556885) ['CHOCOLATE HOT WATER BOTTLE']
('22111', 0.7022193670272827) ['SCOTTIE DOG HOT WATER BOTTLE']


In [60]:
# def similar_products(v, n = 6):
    
#     # extract most similar products for the input vector
# #     ms = model.similar_by_vector(v, topn= n+1)[1:]
#     ms = model.wv.most_similar(v, topn= n+1)[1:]
    
#     # extract name and similarity score of the similar products
#     new_ms = []
#     for j in ms:
#         pair = (products_dict[j[0]][0], j[1])
#         new_ms.append(pair)
        
#     return new_ms  

In [65]:
# # similar_products(model['90019A'])
# similar_products(model.wv.get_item('90019A'))
# # model.wv.get_item() 

In [51]:
# if want more than a single product, 
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [52]:
len(purchases_test[0])

2491

In [54]:
aggregate_vectors(purchases_test[0]).shape

TypeError: 'Word2Vec' object is not subscriptable

In [None]:
similar_products(aggregate_vectors(purchases_val[0]))

In [None]:
# Below I am giving only the last 10 products purchased as input:

similar_products(aggregate_vectors(purchases_val[0][-10:]))

# Data clean up - from apriori "reading"

In [None]:
# Data clean up - from apriori "reading"
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]
df

In [7]:
# one transaction per row, and only include for france to keep dataset small
basket = (df[df['Country']=='France']
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))
basket

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# need anything >1=>1, <0 =>0
def encode_units(x):
    if x <=0:
        return 0
    if x >=1:
        return 1
basket_sets = basket.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)
basket_sets

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules.head()

In [None]:
# add further constraint on rules
rules[(rules['lift'] >=6) &
      (rules['confidence'] >= 0.8)]