In [1]:
import pandas as pd
import numpy as np
import os
import pyvis
import networkx as nx
from matplotlib import pyplot as plt
from itertools import product

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [3]:
current_directory = os.getcwd()

In [4]:
skus_data = pd.read_csv(f"{current_directory}/data/skus.csv", delimiter = ",")
transactions_data = pd.read_csv(f"{current_directory}/data/transactions.csv", delimiter = ",")

# Data cleaning

### Remove Misc. department

In [5]:
misc_skus = skus_data[skus_data["DEPARTMENT"]=="Misc."]["SKU"].unique()

In [6]:
skus_data = skus_data[skus_data["DEPARTMENT"]!="Misc."]
transactions_data = transactions_data[~transactions_data["SKU"].isin(misc_skus)]

# SKU analysis
### Aggregate per sku and count total number of transactions and revenue generated

In [7]:
item_purchases = transactions_data.groupby(["SKU"]).agg(
    TOTAL_TRANSACTION = ("TRANSACTION_ID", "count"),
    TOTAL_REVENUE = ("REVENUE", "sum")
).reset_index()

item_purchases["SUPPORT"] = item_purchases["TOTAL_TRANSACTION"]/item_purchases["TOTAL_TRANSACTION"].sum()

In [8]:
transactions_data = transactions_data.merge(item_purchases, on = "SKU", how = "left")

# Model implementation

### Count number of times a pair of items is bought together

In [9]:
# Create all possible pairs
pairs = list(product(skus_data["SKU"].sort_values(), repeat=2))

# # Create a new DataFrame with the pairs
pairs_df = pd.DataFrame(pairs, columns=["SKU_1", "SKU_2"])
pairs_df["TRANSACTION_COUNT"] = 0

# pairs_df["SKU1"], pairs_df["SKU2"] = zip(*pairs_df.apply(lambda row: sorted([row["SKU1"], row["SKU2"]]), axis=1))
# pairs_df = pairs_df.drop_duplicates()
pairs_df = pairs_df[pairs_df["SKU_1"] != pairs_df["SKU_2"]]

In [10]:
skus_per_transaction = transactions_data.groupby(["TRANSACTION_ID"]).agg({
    "SKU" : lambda x : set(list(x))
}).reset_index()

skus_per_transaction["SKU"] = skus_per_transaction.SKU.map(lambda x : sorted(x))

## Create all possible pairs (including self-pairs)
skus_per_transaction["AllPairs"] = skus_per_transaction["SKU"].map(lambda x : list(product(x, repeat=2)))

In [11]:
for _, row in skus_per_transaction.iterrows():
    list_pairs = row["AllPairs"]
    for _pair in list_pairs:
        sku_id1 = _pair[0]
        sku_id2 = _pair[1]
        # if sku_id1 < sku_id2:
        pairs_df.loc[(pairs_df["SKU_1"]==sku_id1) & (pairs_df["SKU_2"]==sku_id2), "TRANSACTION_COUNT"] += 1

In [12]:
item_purchases1 = item_purchases.copy()
item_purchases1.columns = [str(x)+"_1" for x in item_purchases1.columns]

item_purchases2 = item_purchases.copy()
item_purchases2.columns = [str(x)+"_2" for x in item_purchases2.columns]

pairs_df = pairs_df.merge(item_purchases1, on = "SKU_1", how = "left")
pairs_df = pairs_df.merge(item_purchases2, on = "SKU_2", how = "left")

In [13]:
skus_data1 = skus_data.copy()
skus_data1.columns = [str(x)+"_1" for x in skus_data1.columns]

skus_data2 = skus_data.copy()
skus_data2.columns = [str(x)+"_2" for x in skus_data2.columns]

pairs_df = pairs_df.merge(skus_data1, on = "SKU_1", how = "left")
pairs_df = pairs_df.merge(skus_data2, on = "SKU_2", how = "left")

In [14]:
pairs_df["RATIO_1"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.TOTAL_TRANSACTION_1 if x.TOTAL_TRANSACTION_1 else 0, axis = 1)
pairs_df["RATIO_2"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.TOTAL_TRANSACTION_2 if x.TOTAL_TRANSACTION_2 else 0, axis = 1)
pairs_df["MAX_RATIO"] = pairs_df.apply(lambda x : max(x.RATIO_1, x.RATIO_2), axis = 1)

# Manual thresholds

In [15]:
def filter_nbre_transactions(df, threshold_transactions):
   df = df[df["TOTAL_TRANSACTION_1"]>=threshold_transactions]
   df = df[df["TOTAL_TRANSACTION_2"]>=threshold_transactions]
   return df

In [16]:
pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
tt = pairs_df_filtred[(pairs_df_filtred["MAX_RATIO"]>=0.05)]
tt = tt[(tt["TRANSACTION_COUNT"]>=2)]

In [17]:
tt["SKU_1"].nunique()

213

In [18]:
ll = tt.groupby(["SKU_1"]).agg(
    nbre_halo_effect = ("SKU_2", "count")
).reset_index()

ll["nbre_halo_effect"].value_counts().sort_index()

nbre_halo_effect
1     74
2     53
3     32
4     20
5      2
      ..
11     3
13     1
15     1
23     1
36     1
Name: count, Length: 15, dtype: int64

# Apriori algorithm implementation

In [19]:
pairs_df["CONFIDENCE"] = pairs_df["TRANSACTION_COUNT"]/pairs_df["TOTAL_TRANSACTION_1"]
pairs_df["LIFT"] = pairs_df["CONFIDENCE"]/pairs_df["SUPPORT_1"]

In [20]:
pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
tt = pairs_df_filtred[(pairs_df_filtred["LIFT"]>1)]
tt = tt[(tt["TRANSACTION_COUNT"]>=2)]

In [21]:
tt.sort_values(by = ["LIFT"])

Unnamed: 0,SKU_1,SKU_2,TRANSACTION_COUNT,TOTAL_TRANSACTION_1,TOTAL_REVENUE_1,SUPPORT_1,TOTAL_TRANSACTION_2,TOTAL_REVENUE_2,SUPPORT_2,ITEM_DESCRIPTION_1,DEPARTMENT_1,CATEGORY_1,SUBCATEGORY1_1,SUBCATEGORY2_1,BRAND_1,ITEM_WEIGHT_1,ITEM_UNIT_OF_MEASURE_1,ITEM_DESCRIPTION_2,DEPARTMENT_2,CATEGORY_2,SUBCATEGORY1_2,SUBCATEGORY2_2,BRAND_2,ITEM_WEIGHT_2,ITEM_UNIT_OF_MEASURE_2,RATIO_1,RATIO_2,MAX_RATIO,CONFIDENCE,LIFT
54701,36027,50086,2,90.0,242.2891,0.020930,8.0,8.4250,0.001860,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Schmackos Poultry 20 Stk,Dog,Treat,Soft,Other,Pedigree,0.140,KG,0.022222,0.250000,0.250000,0.022222,1.061728
54719,36027,51306,2,90.0,242.2891,0.020930,6.0,6.6603,0.001395,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Nvd Fd Bites Lamb,Dog,Treat,Other,Other,Natures Variety,0.020,KG,0.022222,0.333333,0.333333,0.022222,1.061728
54735,36027,52080,2,90.0,242.2891,0.020930,12.0,19.9170,0.002791,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Collagen Retriever Roll 5 Chk,Dog,Treat,Hide,Other,Daily Eats,0.055,KG,0.022222,0.166667,0.166667,0.022222,1.061728
54737,36027,52082,2,90.0,242.2891,0.020930,5.0,8.2501,0.001163,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Collagen Retriever Roll 5 Bf,Dog,Treat,Hide,Other,Daily Eats,0.055,KG,0.022222,0.400000,0.400000,0.022222,1.061728
54771,36027,52781,2,90.0,242.2891,0.020930,10.0,14.9502,0.002326,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Beef Fillets,Dog,Treat,Other,Other,Lifestage,0.800,KG,0.022222,0.200000,0.200000,0.022222,1.061728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210,62826,62808,5,6.0,75.0000,0.001395,6.0,100.0002,0.001395,Full Haircut 12kg,Services,Grooming,Dog,Groom,Jolly Groomer,,,Bath Brush & Dry 12kg Lhair,Services,Grooming,Dog,Groom,Jolly Groomer,,,0.833333,0.833333,0.833333,0.833333,597.222222
154496,54644,54656,5,6.0,13.7425,0.001395,9.0,26.0510,0.002093,Beef N Tripe Dinner With Veg,Dog,Food,Frozen,Adult,Durham Animal Feeds,0.50,KG,Duck And Tripe With Veg,Dog,Food,Frozen,Adult,Durham Animal Feeds,0.500,KG,0.833333,0.555556,0.833333,0.833333,597.222222
47724,30685,30686,4,5.0,6.6332,0.001163,6.0,3.3166,0.001395,Nibblots Apple Sa Treats,Sm.Animal,Treat,Other,Na,M&C,0.03,KG,Nibblots Berry Sa Treats,Sm.Animal,Treat,Other,Na,M&C,0.030,KG,0.800000,0.666667,0.800000,0.800000,688.000000
167699,54996,52777,4,5.0,9.9668,0.001163,12.0,27.4085,0.002791,Artisan Beef & Blueberry Bites,Dog,Treat,Other,Other,Lifestage,0.08,KG,Duck Medallions,Dog,Treat,Other,Other,Lifestage,0.100,KG,0.800000,0.333333,0.800000,0.800000,688.000000


In [22]:
item_purchases[item_purchases["SKU"].isin(tt["SKU_1"].unique())]["TOTAL_REVENUE"].sum()/item_purchases["TOTAL_REVENUE"].sum()

np.float64(0.5885458746413047)

In [23]:
tt["SKU_1"].nunique()

213