In [2]:
import pandas as pd
import numpy as np
import os
import pyvis
import networkx as nx
from matplotlib import pyplot as plt
from itertools import product

In [3]:
# Notebook display settings: show every column, but truncate long frames to 10 rows.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 10,
}.items():
    pd.set_option(option_name, option_value)

In [4]:
# Working directory of the kernel process; the data paths below are built from it,
# so the notebook must be launched from the folder that contains `data/`.
current_directory = os.getcwd()

In [5]:
# SKU catalogue (one row per SKU; must expose a "SKU" column).
skus_data = pd.read_csv(
    f"{current_directory}/data/skus.csv",
    sep=",",
)

# Transaction lines (must expose "TRANSACTION_ID" and "SKU" columns).
transactions_data = pd.read_csv(
    f"{current_directory}/data/transactions.csv",
    sep=",",
)

In [9]:
# Number of transaction lines recorded for each SKU (non-null TRANSACTION_ID values).
# NOTE(review): "count" tallies rows, not distinct transactions; if a transaction can
# list the same SKU more than once, "nunique" may be what is intended - confirm.
item_purchases = (
    transactions_data
    .groupby("SKU", as_index=False)
    .agg(TOTAL_TRANSACTION=("TRANSACTION_ID", "count"))
)

In [8]:
# Attach each SKU's total transaction count to every transaction line.
# Any TOTAL_TRANSACTION column left by a previous execution of this cell is dropped
# first, so re-running the cell does not pile up TOTAL_TRANSACTION_x / _y columns.
transactions_data = (
    transactions_data
    .drop(columns="TOTAL_TRANSACTION", errors="ignore")
    .merge(item_purchases, on="SKU", how="left")
)

In [None]:
# Build every ordered pair of catalogue SKUs (both (a, b) and (b, a) are kept).
pairs = list(product(skus_data["SKU"].sort_values(), repeat=2))

# One row per pair, with the co-purchase counter initialised to zero.
pairs_df = pd.DataFrame(pairs, columns=["SKU_1", "SKU_2"])
pairs_df["PurchasesCount"] = 0

# Drop self-pairs. The explicit .copy() makes the result an independent frame, so the
# later in-place update of "PurchasesCount" does not raise SettingWithCopyWarning.
pairs_df = pairs_df[pairs_df["SKU_1"] != pairs_df["SKU_2"]].copy()

In [None]:
# Collapse each transaction to the set of distinct SKUs it contains.
skus_per_transaction = (
    transactions_data
    .groupby(["TRANSACTION_ID"])
    .agg({"SKU": lambda skus: set(skus)})
    .reset_index()
)

# Store the SKUs of each transaction as a sorted list.
skus_per_transaction["SKU"] = skus_per_transaction["SKU"].map(
    lambda sku_set: sorted(sku_set)
)

# Every ordered pair of SKUs within a transaction, self-pairs included.
skus_per_transaction["AllPairs"] = skus_per_transaction["SKU"].map(
    lambda sku_list: list(product(sku_list, repeat=2))
)

In [None]:
# Tally, in a single pass, how many transactions contain each ordered SKU pair.
# (The previous version scanned the whole of pairs_df once per observed pair, and
# its "+= 1" kept accumulating if the cell was executed more than once.)
pair_counts = {}
for transaction_pairs in skus_per_transaction["AllPairs"]:
    for sku_pair in transaction_pairs:
        pair_counts[sku_pair] = pair_counts.get(sku_pair, 0) + 1

# Look up each candidate pair; pairs never bought together get 0. Self-pairs present
# in the tally are simply never looked up because pairs_df does not contain them.
# The column is recomputed from scratch, so re-running this cell is safe.
pairs_df = pairs_df.assign(
    PurchasesCount=[
        pair_counts.get(sku_pair, 0)
        for sku_pair in zip(pairs_df["SKU_1"], pairs_df["SKU_2"])
    ]
)

In [None]:
# Attach the per-SKU transaction totals to each side of the pair, one side at a time.
for key_column, total_column in (
    ("SKU_1", "TOTAL_TRANSACTIONS_SKU_1"),
    ("SKU_2", "TOTAL_TRANSACTIONS_SKU_2"),
):
    pairs_df = (
        pairs_df
        .merge(item_purchases, left_on=key_column, right_on="SKU", how="left")
        .drop(columns="SKU")
        .rename(columns={"TOTAL_TRANSACTION": total_column})
    )

In [None]:
# Two copies of the SKU catalogue whose column names carry a "_1" / "_2" suffix,
# so they can be joined onto the SKU_1 and SKU_2 sides of a pair respectively.
skus_data1 = skus_data.add_suffix("_1")
skus_data2 = skus_data.add_suffix("_2")

In [None]:
# Enrich each pair with the catalogue attributes of both of its SKUs.
pairs_df = (
    pairs_df
    .merge(skus_data1, how="left", on="SKU_1")
    .merge(skus_data2, how="left", on="SKU_2")
)

In [None]:
# Share of each SKU's transactions in which the pair was bought together.
# A total that is zero or missing (a catalogue SKU with no transactions) is masked
# before dividing, so the ratio is 0 in both cases. The previous row-wise version
# let a missing total through its truthiness check and produced NaN.
pairs_df["RATIO_SKU_1"] = (
    pairs_df["PurchasesCount"]
    .div(pairs_df["TOTAL_TRANSACTIONS_SKU_1"].where(pairs_df["TOTAL_TRANSACTIONS_SKU_1"] != 0))
    .fillna(0.0)
)
pairs_df["RATIO_SKU_2"] = (
    pairs_df["PurchasesCount"]
    .div(pairs_df["TOTAL_TRANSACTIONS_SKU_2"].where(pairs_df["TOTAL_TRANSACTIONS_SKU_2"] != 0))
    .fillna(0.0)
)

In [None]:
# Larger of the two ratios for each pair. DataFrame.max skips NaN, whereas Python's
# built-in max() returns a different answer depending on which argument is NaN.
pairs_df["MAX_RATIO"] = pairs_df[["RATIO_SKU_1", "RATIO_SKU_2"]].max(axis=1)

In [None]:
def filter_nbre_transactions(df, threshold_transactions):
    """Keep the pairs whose two SKUs both reach a minimum transaction total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "TOTAL_TRANSACTIONS_SKU_1" and "TOTAL_TRANSACTIONS_SKU_2" columns.
    threshold_transactions : int or float
        Inclusive lower bound applied to both columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` meeting the bound on both columns; the input is not modified.
        Rows with a missing total never satisfy the comparison and are dropped.
    """
    enough_for_sku_1 = df["TOTAL_TRANSACTIONS_SKU_1"] >= threshold_transactions
    enough_for_sku_2 = df["TOTAL_TRANSACTIONS_SKU_2"] >= threshold_transactions
    return df[enough_for_sku_1 & enough_for_sku_2]

In [None]:
# Keep pairs whose two SKUs each have at least 6 transactions...
pairs_df_filtred = filter_nbre_transactions(pairs_df, 6)

# ...then keep those with a best ratio of at least 0.10 and at least 3 joint purchases.
tt = pairs_df_filtred.loc[
    lambda frame: (frame["MAX_RATIO"] >= 0.10) & (frame["PurchasesCount"] >= 3)
]

In [None]:
# For each SKU_1 in tt, count how many SKU_2 partners it is paired with.
ll = tt.groupby("SKU_1", as_index=False).agg(nbre_halo_effect=("SKU_2", "count"))

# Distribution of those partner counts, ordered by the count value.
ll["nbre_halo_effect"].value_counts().sort_index()