In [1]:
import pandas as pd
import numpy as np
import os
import pyvis
import networkx as nx
from matplotlib import pyplot as plt
from itertools import product

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [3]:
current_directory = os.getcwd()

In [4]:
skus_data = pd.read_csv(f"{current_directory}/data/skus.csv", delimiter = ",")
transactions_data = pd.read_csv(f"{current_directory}/data/transactions.csv", delimiter = ",")

In [5]:
item_purchases = transactions_data.groupby(["SKU"]).agg(
    {"TRANSACTION_ID" : "count"}
).reset_index().rename(columns = {"TRANSACTION_ID" : "TOTAL_TRANSACTION"})

item_purchases["SUPPORT"] = item_purchases["TOTAL_TRANSACTION"]/item_purchases["TOTAL_TRANSACTION"].sum()

In [6]:
transactions_data = transactions_data.merge(item_purchases, on = "SKU", how = "left")

# Model implementation

### Count number of times a pair of items is bought together

In [7]:
# Create all possible pairs
pairs = list(product(skus_data["SKU"].sort_values(), repeat=2))

# # Create a new DataFrame with the pairs
pairs_df = pd.DataFrame(pairs, columns=["SKU_1", "SKU_2"])
pairs_df["TRANSACTION_COUNT"] = 0

# pairs_df["SKU1"], pairs_df["SKU2"] = zip(*pairs_df.apply(lambda row: sorted([row["SKU1"], row["SKU2"]]), axis=1))
# pairs_df = pairs_df.drop_duplicates()
pairs_df = pairs_df[pairs_df["SKU_1"] != pairs_df["SKU_2"]]

In [8]:
skus_per_transaction = transactions_data.groupby(["TRANSACTION_ID"]).agg({
    "SKU" : lambda x : set(list(x))
}).reset_index()

skus_per_transaction["SKU"] = skus_per_transaction.SKU.map(lambda x : sorted(x))

## Create all possible pairs (including self-pairs)
skus_per_transaction["AllPairs"] = skus_per_transaction["SKU"].map(lambda x : list(product(x, repeat=2)))

In [9]:
for _, row in skus_per_transaction.iterrows():
    list_pairs = row["AllPairs"]
    for _pair in list_pairs:
        sku_id1 = _pair[0]
        sku_id2 = _pair[1]
        # if sku_id1 < sku_id2:
        pairs_df.loc[(pairs_df["SKU_1"]==sku_id1) & (pairs_df["SKU_2"]==sku_id2), "TRANSACTION_COUNT"] += 1

In [10]:
item_purchases1 = item_purchases.copy()
item_purchases1.columns = [str(x)+"_1" for x in item_purchases1.columns]

item_purchases2 = item_purchases.copy()
item_purchases2.columns = [str(x)+"_2" for x in item_purchases2.columns]

pairs_df = pairs_df.merge(item_purchases1, on = "SKU_1", how = "left")
pairs_df = pairs_df.merge(item_purchases2, on = "SKU_2", how = "left")

In [11]:
skus_data1 = skus_data.copy()
skus_data1.columns = [str(x)+"_1" for x in skus_data1.columns]

skus_data2 = skus_data.copy()
skus_data2.columns = [str(x)+"_2" for x in skus_data2.columns]

pairs_df = pairs_df.merge(skus_data1, on = "SKU_1", how = "left")
pairs_df = pairs_df.merge(skus_data2, on = "SKU_2", how = "left")

In [13]:
pairs_df["RATIO_1"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.TOTAL_TRANSACTION_1 if x.TOTAL_TRANSACTION_1 else 0, axis = 1)
pairs_df["RATIO_2"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.TOTAL_TRANSACTION_2 if x.TOTAL_TRANSACTION_2 else 0, axis = 1)
pairs_df["MAX_RATIO"] = pairs_df.apply(lambda x : max(x.RATIO_1, x.RATIO_2), axis = 1)

# Manual thresholds

In [14]:
def filter_nbre_transactions(df, threshold_transactions):
   df = df[df["TOTAL_TRANSACTION_1"]>=threshold_transactions]
   df = df[df["TOTAL_TRANSACTION_2"]>=threshold_transactions]
   return df

In [15]:
pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
tt = pairs_df_filtred[(pairs_df_filtred["MAX_RATIO"]>=0.05)]
tt = tt[(tt["TRANSACTION_COUNT"]>=2)]

In [16]:
tt["SKU_1"].nunique()

249

In [17]:
ll = tt.groupby(["SKU_1"]).agg(
    nbre_halo_effect = ("SKU_2", "count")
).reset_index()

ll["nbre_halo_effect"].value_counts().sort_index()

nbre_halo_effect
1     66
2     69
3     29
4     33
5     13
      ..
26     1
37     1
42     1
63     1
76     1
Name: count, Length: 20, dtype: int64

# Apriori algorithm implementation

In [18]:
pairs_df["CONFIDENCE"] = pairs_df["TRANSACTION_COUNT"]/pairs_df["TOTAL_TRANSACTION_1"]
pairs_df["LIFT"] = pairs_df["CONFIDENCE"]/pairs_df["SUPPORT_1"]

In [24]:
pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
tt = pairs_df_filtred[(pairs_df_filtred["LIFT"]>1)]
tt = tt[(tt["TRANSACTION_COUNT"]>=2)]
tt

Unnamed: 0,SKU_1,SKU_2,TRANSACTION_COUNT,TOTAL_TRANSACTION_1,SUPPORT_1,TOTAL_TRANSACTION_2,SUPPORT_2,ITEM_DESCRIPTION_1,DEPARTMENT_1,CATEGORY_1,SUBCATEGORY1_1,SUBCATEGORY2_1,BRAND_1,ITEM_WEIGHT_1,ITEM_UNIT_OF_MEASURE_1,ITEM_DESCRIPTION_2,DEPARTMENT_2,CATEGORY_2,SUBCATEGORY1_2,SUBCATEGORY2_2,BRAND_2,ITEM_WEIGHT_2,ITEM_UNIT_OF_MEASURE_2,RATIO_1,RATIO_2,MAX_RATIO,CONFIDENCE,LIFT
831,8462,54690,2,5.0,0.001023,116.0,0.023727,Kagensan No.6 Sandshee,Bird,Accessory,Other,Na,Kagesan,0.440,KG,Reusable Bag,Misc.,Other,Bags,Na,R&R,,,0.400000,0.017241,0.400000,0.400000,391.120000
1001,9515,9517,2,17.0,0.003477,5.0,0.001023,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Frozen F/F Chicken,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,0.117647,0.400000,0.400000,0.117647,33.833910
1002,9515,9518,4,17.0,0.003477,8.0,0.001636,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Frozen F/F Lamb,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,0.235294,0.500000,0.500000,0.235294,67.667820
1110,9515,36027,3,17.0,0.003477,90.0,0.018409,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,0.176471,0.033333,0.176471,0.176471,50.750865
1363,9515,56439,2,17.0,0.003477,7.0,0.001432,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Rabbit Ears Packed,Dog,Treat,Natural,Other,T. Forrest,0.180,KG,0.117647,0.285714,0.285714,0.117647,33.833910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249439,66971,63417,2,17.0,0.003477,17.0,0.003477,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Salmon Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,0.117647,0.117647,0.117647,0.117647,33.833910
249440,66971,63418,2,17.0,0.003477,34.0,0.006954,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Turkey Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,0.117647,0.058824,0.117647,0.117647,33.833910
249469,66971,65783,2,17.0,0.003477,25.0,0.005114,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Green Beef Tripe,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,0.117647,0.080000,0.117647,0.117647,33.833910
249470,66971,65784,3,17.0,0.003477,10.0,0.002045,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Pork Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,0.176471,0.300000,0.300000,0.176471,50.750865


In [25]:
tt["SKU_1"].nunique()

249