In [5]:
import os
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
import numpy as np
from autocorrect import Speller
from googletrans import Translator
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from mlxtend.frequent_patterns import apriori, association_rules
from collections import Counter
import scipy
from scipy.spatial.distance import pdist, squareform
import sys
os.getcwd()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vahid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vahid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'C:\\Users\\Vahid\\OneDrive - University of Toronto\\MMA - 2022 -\\Jupyter files\\BigBasket'

In [8]:
# Directory that holds the BigBasket workbook. Override it per machine through
# the BIGBASKET_DATA_DIR environment variable instead of editing this cell, e.g.
#   '/Users/Vahid/Library/CloudStorage/OneDrive-UniversityofToronto/MMA - 2022 -/Jupyter files/Data/'
DATA_DIR = os.environ.get(
    'BIGBASKET_DATA_DIR',
    'C:\\Users\\Vahid\\OneDrive - University of Toronto\\MMA - 2022 -\\Jupyter files\\Data\\',
)
# `path` keeps a trailing separator so any cell doing `path + filename` still works.
path = os.path.join(DATA_DIR, '')
# Load the point-of-sale sheet of the workbook into the raw frame.
df_raw = pd.read_excel(os.path.join(path, 'IMB575-XLS-ENG.xls'), sheet_name='POS DATA')

In [9]:
# Preview the first rows to confirm the sheet parsed into the expected columns.
df_raw.head()

Unnamed: 0,Member,Order,SKU,Created On,Description
0,M09736,6468572,34993740,22-09-2014 22:45,Other Sauces
1,M09736,6468572,15669800,22-09-2014 22:45,Cashews
2,M09736,6468572,34989501,22-09-2014 22:45,Other Dals
3,M09736,6468572,7572303,22-09-2014 22:45,Namkeen
4,M09736,6468572,15669856,22-09-2014 22:45,Sugar


# Data summary

In [10]:
# (rows, columns) of the raw table.
df_raw.shape

(62141, 5)

In [11]:
# Column dtypes and non-null counts; Order and SKU are read in as int64.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62141 entries, 0 to 62140
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Member       62141 non-null  object
 1   Order        62141 non-null  int64 
 2   SKU          62141 non-null  int64 
 3   Created On   62141 non-null  object
 4   Description  62141 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.4+ MB


### Change the type of Order and SKU to object

In [12]:
# Store Order and SKU as object so that describe(include='object') reports
# count / unique / top / freq for them instead of numeric statistics.
df_raw = df_raw.astype({'SKU': 'object', 'Order': 'object'})

In [13]:
# Count, number of unique values, most frequent value and its frequency
# for every object-typed column.
df_raw.describe(include = 'object')

Unnamed: 0,Member,Order,SKU,Created On,Description
count,62141,62141,62141,62141,62141
unique,106,8387,1732,8352,216
top,M38622,6738016,15668381,14-07-2014 11:02,Other Vegetables
freq,1438,42,1702,42,4606


# Cleaning Dataset

In [14]:
# List the distinct Description labels so malformed values can be spotted.
df_raw['Description'].unique()

array(['Other Sauces', 'Cashews', 'Other Dals', 'Namkeen', 'Sugar',
       'Banana', 'Sugar Cubes', 'Other Sweets', 'Other Rice Products',
       'Utensil Scrub Pads', 'Toor Dal', 'Buns & Pavs', 'Cakes',
       'Boiled Rice', 'Urad Dal', 'Jaggery', 'Raisins', 'Chips',
       'Almonds', 'Root Vegetables', 'Other Vegetables', 'Moong Dal',
       'Healthy Snacks', 'Raw Peanuts', 'Mosquito Repellent',
       'Whole Spices', 'Soya Products', 'Beans', 'Health Drinks',
       'Liquid Soaps & Bars', 'Washing Bars', 'Ghee', 'Organic F&V',
       'Sunflower Oils', 'Shoe Polish', 'Cream Biscuits', 'Cookies',
       'Snacky Nuts', 'Avalakki / Poha', 'Besan', 'Raw Rice',
       'Powdered Spices', 'Eggs', 'Corn Snacks', 'Bread',
       'Instant Noodles', 'Sooji & Rava', 'Gourd & Cucumber',
       'Shaving Cream, Foam & Gels', 'Brinjals', 'Hair Oil',
       'Ground Coffee', 'Other Dry Fruits', 'Basmati Rice', 'Face Wash',
       'Yogurt & Lassi', 'Wafers', 'Instant Pastas', 'Cooking Paste',
       'O

### Remove the non-meaningful description

In [15]:
# One Description value is a fragment of several CSV records fused together
# rather than a product category; drop every row that carries it.
malformed_descriptions = ['CFLs,cfls,7832553,2,0\r\n7831582,Channa Dal,channa-dal,7832491,2,0\r\n7832360,Cheese,cheese,7832363,2,0\r\n7831689,Chewing Gum,chewing-gum,7831685,2,0\r\n7832047,ChildrenS Books']
is_malformed = df_raw['Description'].isin(malformed_descriptions)
df_raw = df_raw[~is_malformed]

# Are there any null values?

In [16]:
# Number of missing values per column.
df_raw.isna().sum()

Member         0
Order          0
SKU            0
Created On     0
Description    0
dtype: int64

# Vectorization

In [17]:
# Build the order x category count matrix: one row per Order, one column per
# Description, each cell holding the number of rows of that category in the
# order. unstack() already leaves Order as the index, so the former
# reset_index() ... set_index('Order') round trip was a no-op and is dropped.
df = (
    df_raw
    .groupby(['Order', 'Description'])['SKU']
    .size()
    .unstack()
    .fillna(0)  # a category absent from an order comes out as NaN -> count 0
)

df.head()

Description,After Shave,Agarbatti,Almonds,Aluminium Foil & Cling Wrap,Antiseptics,Avalakki / Poha,Ayurvedic,Ayurvedic Food,Baby Care Accessories,Baby Cereal,...,Vanaspati,Veg & Fruit,Vermicelli,Vinegar,Wafers,Washing Bars,Whole Grains,Whole Spices,Womens Deo,Yogurt & Lassi
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6422558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6422636,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6423338,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
6423534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6423959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Changing to a binary dataset

In [25]:
def encode_u(x):
    """Encode an item count as a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Count to encode.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0. NaN compares False against any
        number, so a missing count is encoded as 0 (absent) rather than
        being mistaken for a purchase.
    """
    return 1 if x >= 1 else 0

# Collapse counts to presence/absence: 1 if the value is >= 1, else 0.
# This is the vectorised form of applying encode_u to every cell; it avoids
# DataFrame.applymap, which pandas deprecated in 2.1 in favour of
# DataFrame.map, and skips one Python call per cell.
df = (df >= 1).astype('int64')
df.head(5)

Description,After Shave,Agarbatti,Almonds,Aluminium Foil & Cling Wrap,Antiseptics,Avalakki / Poha,Ayurvedic,Ayurvedic Food,Baby Care Accessories,Baby Cereal,...,Vanaspati,Veg & Fruit,Vermicelli,Vinegar,Wafers,Washing Bars,Whole Grains,Whole Spices,Womens Deo,Yogurt & Lassi
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6422558,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6422636,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6423338,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6423534,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6423959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apriori Algorithm


### Set Minimum Support


In [26]:
# Mine the itemsets whose support is at least 0.05;
# use_colnames=True reports category names instead of column positions.
frequentitemsets = apriori(df, use_colnames=True, min_support=0.05)
# Show the itemsets from lowest to highest support.
frequentitemsets.sort_values(by='support').reset_index(drop=True)



Unnamed: 0,support,itemsets
0,0.050680,"(Organic F&V, Banana)"
1,0.050680,"(Banana, Beans, Brinjals)"
2,0.051037,"(Beans, Exotic Vegetables)"
3,0.051634,"(Beans, Raw Rice)"
4,0.051753,"(Moong Dal, Toor Dal)"
...,...,...
120,0.272478,(Brinjals)
121,0.300382,(Gourd & Cucumber)
122,0.399118,(Beans)
123,0.414143,(Root Vegetables)


In [27]:
# Derive association rules from the frequent itemsets, thresholding on lift at 1.
wooden_star_rules = association_rules(frequentitemsets, min_threshold=1, metric="lift")
# Highest lift first (ties broken by support); show only the first seven
# columns, antecedents through lift.
(
    wooden_star_rules
    .sort_values(by=['lift', 'support'], ascending=False)
    .reset_index(drop=True)
    .iloc[:, :7]
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Moong Dal),(Toor Dal),0.134868,0.152278,0.051753,0.383731,2.519945
1,(Toor Dal),(Moong Dal),0.152278,0.134868,0.051753,0.339859,2.519945
2,(Other Dals),(Moong Dal),0.232769,0.134868,0.077272,0.331967,2.461430
3,(Moong Dal),(Other Dals),0.134868,0.232769,0.077272,0.572944,2.461430
4,(Urad Dal),(Other Dals),0.102671,0.232769,0.054496,0.530778,2.280280
...,...,...,...,...,...,...,...
339,(Sugar),(Other Vegetables),0.159432,0.427856,0.070117,0.439791,1.027894
340,(Root Vegetables),(Sugar),0.414143,0.159432,0.067016,0.161820,1.014974
341,(Sugar),(Root Vegetables),0.159432,0.414143,0.067016,0.420344,1.014974
342,(Banana),(Other Dals),0.260315,0.232769,0.061293,0.235456,1.011543


# Cosine Similarity Measure between Baskets

In [28]:
# Basket-to-basket cosine similarity: pairwise_distances gives the cosine
# distance between every pair of rows of df, and 1 - distance is the similarity.
cosine_sim = 1-pairwise_distances(df,metric='cosine')

In [29]:
# Wrap the similarity array in a DataFrame for display.
# NOTE(review): rows and columns get default positional labels (0, 1, 2, ...),
# not the Order ids held in df.index -- confirm positional labels are intended.
cosine_sim = pd.DataFrame(cosine_sim)

In [30]:
# Display the similarity matrix (pandas truncates the middle rows and columns).
cosine_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8376,8377,8378,8379,8380,8381,8382,8383,8384,8385
0,1.000000,0.119523,0.119523,0.000000,0.000000,0.000000,0.000000,0.292770,0.218218,0.0,...,0.133631,0.000000,0.169031,0.000000,0.000000,0.218218,0.188982,0.285714,0.109109,0.478091
1,0.119523,1.000000,0.200000,0.000000,0.316228,0.565685,0.316228,0.571548,0.182574,0.0,...,0.670820,0.424264,0.282843,0.182574,0.158114,0.182574,0.474342,0.358569,0.456435,0.400000
2,0.119523,0.200000,1.000000,0.000000,0.000000,0.141421,0.000000,0.163299,0.182574,0.0,...,0.111803,0.141421,0.141421,0.000000,0.316228,0.000000,0.158114,0.358569,0.091287,0.100000
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.223607,0.000000,0.000000,0.288675,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.316228,0.000000,0.000000,1.000000,0.000000,0.000000,0.129099,0.000000,0.0,...,0.000000,0.223607,0.223607,0.577350,0.250000,0.000000,0.000000,0.188982,0.144338,0.316228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381,0.218218,0.182574,0.000000,0.288675,0.000000,0.258199,0.000000,0.298142,0.000000,0.0,...,0.204124,0.000000,0.258199,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.182574
8382,0.188982,0.474342,0.158114,0.000000,0.000000,0.447214,0.500000,0.387298,0.577350,0.0,...,0.530330,0.447214,0.670820,0.000000,0.000000,0.000000,1.000000,0.000000,0.433013,0.316228
8383,0.285714,0.358569,0.358569,0.000000,0.188982,0.169031,0.000000,0.195180,0.000000,0.0,...,0.267261,0.169031,0.000000,0.218218,0.188982,0.000000,0.000000,1.000000,0.109109,0.358569
8384,0.109109,0.456435,0.091287,0.000000,0.144338,0.258199,0.288675,0.447214,0.166667,0.0,...,0.408248,0.516398,0.387298,0.333333,0.288675,0.000000,0.433013,0.109109,1.000000,0.365148


In [31]:
#fig, ax = plt.subplots(figsize=(10,10))
#cax = ax.matshow(cosine_sim, interpolation='nearest')
#ax.grid(True)
#plt.title('Cosine Similairty Matrix by Basket')
#fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, .8,.9,1])
#plt.show()

# Dice Coefficient

### Dice coefficient between each pair of products across baskets

In [32]:
# Transpose so that each row is a product and each column an order.
df_prod = df.T

# Condensed vector of pairwise Dice dissimilarities between the product rows.
a = pdist(df_prod, metric='dice')
# Expand to a square matrix, turn dissimilarity into similarity (1 - d) and
# label both axes with the product names.
Dice_sim_prod = pd.DataFrame(1 - squareform(a), index=df.columns, columns=df.columns)
Dice_sim_prod.head(10)

Description,After Shave,Agarbatti,Almonds,Aluminium Foil & Cling Wrap,Antiseptics,Avalakki / Poha,Ayurvedic,Ayurvedic Food,Baby Care Accessories,Baby Cereal,...,Vanaspati,Veg & Fruit,Vermicelli,Vinegar,Wafers,Washing Bars,Whole Grains,Whole Spices,Womens Deo,Yogurt & Lassi
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
After Shave,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
Agarbatti,0.0,1.0,0.016949,0.019512,0.0,0.03352,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.024733,0.0,0.006601
Almonds,0.0,0.016949,1.0,0.036641,0.0,0.109777,0.003759,0.0,0.0,0.003831,...,0.0,0.007117,0.016129,0.01107,0.014493,0.007421,0.00692,0.149843,0.0,0.050465
Aluminium Foil & Cling Wrap,0.0,0.019512,0.036641,1.0,0.0,0.023047,0.0,0.0,0.0,0.0,...,0.0,0.0,0.025532,0.0,0.011976,0.0,0.010363,0.030369,0.0,0.054348
Antiseptics,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.068966,0.0,0.0,0.0,0.0,0.0,0.0
Avalakki / Poha,0.0,0.03352,0.109777,0.023047,0.0,1.0,0.0,0.003072,0.0,0.0,...,0.0,0.008721,0.058981,0.011976,0.00885,0.0,0.022727,0.220807,0.0,0.029579
Ayurvedic,0.0,0.0,0.003759,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004648,0.0,0.008163
Ayurvedic Food,0.0,0.0,0.0,0.0,0.0,0.003072,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002334,0.0,0.0
Baby Care Accessories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Baby Cereal,0.0,0.0,0.003831,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Square matrix: one row and one column per product.
Dice_sim_prod.shape

(215, 215)

In [None]:
#fig, ax = plt.subplots(figsize=(10,10))
#cax = ax.matshow(Dice_sim_prod, interpolation='nearest')
#ax.grid(True)
#plt.title('Dice Similairty Matrix by Product', fontsize=18)
#fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, .8,.9,1])
#plt.xlabel('Product Index', fontsize=18)
#plt.ylabel('Product Index', fontsize=18)
#plt.xticks( fontsize=18)
#plt.yticks( fontsize=18)
#plt.show()