In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

Load Data

In [2]:
def load_data(base_path):
    """Read the raw product and line-item CSVs and join them.

    Parameters
    ----------
    base_path : str or path-like
        Directory that contains the ``data/raw`` folder.

    Returns
    -------
    pandas.DataFrame
        Line items joined to their product attributes on ``product_id``.
        The join uses pandas' default (inner) behaviour, so line items
        whose ``product_id`` is missing from the product table are dropped.
    """
    product_table = pd.read_csv(f"{base_path}/data/raw/products.csv")
    sales_lines = pd.read_csv(f"{base_path}/data/raw/store_sales_line_items.csv")

    # Line items stay on the left so their columns come first in the result.
    return pd.merge(sales_lines, product_table, on="product_id")


Build Transaction Baskets

In [3]:
def build_baskets(merged_df):
    """Collapse line items into one basket (list of product names) per transaction.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``transaction_id`` and ``product_name`` columns.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``transaction_id`` and ``basket``, where ``basket`` is
        the list of ``product_name`` values found in that transaction.
        Repeated names are kept as-is in the list.
    """
    names_per_transaction = merged_df.groupby("transaction_id")["product_name"]
    basket_lists = names_per_transaction.apply(list)

    # Name the Series first so the column comes out as "basket" after the
    # index is turned back into a regular column.
    return basket_lists.rename("basket").reset_index()


Generate Product Pairs

In [4]:
def generate_pairs(baskets_df):
    """List every unordered product pair that shares a basket.

    Each basket is de-duplicated and sorted before pairing, so a pair is
    emitted at most once per basket and always as ``(smaller, larger)``.
    Baskets with fewer than two distinct products contribute nothing.

    Parameters
    ----------
    baskets_df : pandas.DataFrame
        Must contain a ``basket`` column of iterables of product names.

    Returns
    -------
    pandas.Series
        One 2-tuple per (basket, pair) occurrence.
    """
    distinct_sorted = (sorted(set(items)) for items in baskets_df["basket"])

    pair_list = [
        pair
        for distinct in distinct_sorted
        if len(distinct) > 1
        for pair in combinations(distinct, 2)
    ]

    return pd.Series(pair_list)


Compute Support, Confidence & Lift

In [5]:
def compute_affinity_metrics(pairs, baskets_df):
    """Compute support, confidence and lift for every co-purchased pair.

    For a pair ``(A, B)``:

    * ``support``    = transactions containing both A and B / all transactions
    * ``confidence`` = transactions containing both / transactions containing A
      (directional: only A -> B is reported, A being the first tuple element)
    * ``lift``       = confidence / (transactions containing B / all transactions)

    Parameters
    ----------
    pairs : pandas.Series
        Series of 2-tuples of product names, one entry per basket in which
        the pair occurs (the shape produced by ``generate_pairs``).
    baskets_df : pandas.DataFrame
        Must contain a ``basket`` column of lists of product names; its
        length is used as the total number of transactions.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_A``, ``product_B``, ``support``, ``confidence``,
        ``lift`` (metrics rounded to 4 decimals). The columns are present
        even when there are no pairs, so downstream filtering still works.
    """
    columns = ["product_A", "product_B", "support", "confidence", "lift"]
    total_transactions = len(baskets_df)

    pair_counts = pairs.value_counts()

    # Count each product at most once per basket. Pairs are counted per
    # transaction, so the item denominators must also be "transactions
    # containing the item"; counting raw line items would understate
    # confidence and distort lift whenever a product repeats in a basket.
    item_counts = (
        baskets_df["basket"]
        .map(lambda basket: list(set(basket)))
        .explode()
        .value_counts()
    )

    results = []

    for (item_a, item_b), pair_count in pair_counts.items():
        support = pair_count / total_transactions
        confidence = pair_count / item_counts[item_a]
        lift = confidence / (item_counts[item_b] / total_transactions)

        results.append({
            "product_A": item_a,
            "product_B": item_b,
            "support": round(support, 4),
            "confidence": round(confidence, 4),
            "lift": round(lift, 4)
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(results, columns=columns)


Filter & Rank Results

In [6]:
def get_top_affinities(
    affinity_df,
    min_support=0.01,
    min_confidence=0.1,
    top_k=10
):
    """Return the highest-lift pairs that clear the support/confidence floors.

    Parameters
    ----------
    affinity_df : pandas.DataFrame
        Must contain ``support``, ``confidence`` and ``lift`` columns.
    min_support : float, default 0.01
        Rows with ``support`` below this value are discarded.
    min_confidence : float, default 0.1
        Rows with ``confidence`` below this value are discarded.
    top_k : int, default 10
        Number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        The surviving rows ordered by ``lift`` descending, truncated to
        ``top_k`` rows. Original index labels are preserved.
    """
    enough_support = affinity_df["support"] >= min_support
    enough_confidence = affinity_df["confidence"] >= min_confidence

    qualifying = affinity_df[enough_support & enough_confidence]
    ranked = qualifying.sort_values("lift", ascending=False)

    return ranked.head(top_k)


End-to-End Runner

In [7]:
def run_affinity_pipeline(base_path):
    """Run the full affinity analysis starting from the raw CSV files.

    Chains the stages defined in this notebook: load and merge the raw
    data, build per-transaction baskets, generate product pairs, and
    compute the affinity metrics.

    Parameters
    ----------
    base_path : str or path-like
        Directory that contains the ``data/raw`` folder.

    Returns
    -------
    pandas.DataFrame
        The unfiltered metrics table from ``compute_affinity_metrics``;
        filtering and ranking are left to the caller.
    """
    baskets = build_baskets(load_data(base_path))
    return compute_affinity_metrics(generate_pairs(baskets), baskets)
