# preprocessing

In [1]:
!git clone https://github.com/ItamarBerger/TabularDS

fatal: destination path 'TabularDS' already exists and is not an empty directory.


In [2]:
# load dataset
import pandas as pd
retails = pd.read_csv('/content/TabularDS/final_project/code/data/online_retail.csv')

In [3]:
retails.head()
# print(len(retails))

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [4]:
# transforming raw text into IDs of items
retails['DescriptionID'] = retails['Description'].astype('category').cat.codes

# aggregating each transaction as set of items for each Invoice
# retails_with_ids = retails.groupby('InvoiceNo')['DescriptionID'].apply(tuple).reset_index()
# retails_with_ids.head()

In [5]:
# convert dataset into novice-item table. just like done in recommendation systems.
df_onehot = pd.crosstab(retails['InvoiceNo'], retails['DescriptionID'])

# Rename columns to include "Item_" prefix for clarity
df_onehot.columns = [f'Item_{col}' for col in df_onehot.columns]


# truncate dataset for better time complexity experiment
# df_onehot = df_onehot.iloc[:5000]

In [6]:
df_onehot.head(40)

Unnamed: 0_level_0,Item_-1,Item_0,Item_1,Item_2,Item_3,Item_4,Item_5,Item_6,Item_7,Item_8,...,Item_4213,Item_4214,Item_4215,Item_4216,Item_4217,Item_4218,Item_4219,Item_4220,Item_4221,Item_4222
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536370,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
536371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536374,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

item_sim_matrix = cosine_similarity(df_onehot.T)  # Item-item similarity
item_sim_df = pd.DataFrame(item_sim_matrix, index=df_onehot.columns, columns=df_onehot.columns)


In [10]:
# perform item-based CF for rule mining
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_item_ratings(invoice_items, item_sim_df):
    """
    Calculate predicted rating for items not in the invoice.
    Rating(Item X) = sum(sim(Item X, item_in_invoice)) / sum(similarities)
    """
    scores = {}
    norm_factor = 0

    for item in invoice_items:
        similar_items = item_sim_df[item].drop(index=invoice_items, errors="ignore")  # Remove existing items
        for sim_item, similarity in similar_items.items():
            scores[sim_item] = scores.get(sim_item, 0) + similarity  # Aggregate similarity scores
            norm_factor += similarity  # Normalize by sum of similarities

    if norm_factor == 0:
        return {}

    # Normalize scores
    for item in scores:
        scores[item] /= norm_factor

    return scores

In [11]:
from tqdm.notebook import tqdm
threshold = 0.2  # Min rating for an item to be considered

rules = []
for invoice in tqdm(df_onehot.index,desc="Processing Raiting"):
    existing_items = df_onehot.columns[df_onehot.loc[invoice] == 1].tolist()
    predicted_ratings = compute_item_ratings(existing_items, item_sim_df)

    # Select items with rating above threshold
    recommended_items = {item for item, score in predicted_ratings.items() if score > threshold}

    if recommended_items:
        rules.append((set(existing_items), recommended_items))  # Format: {A, B, C} → {D, F}

rules_df = pd.DataFrame(rules, columns=["Antecedent", "Consequent"])


Processing Raiting:   0%|          | 0/5000 [00:00<?, ?it/s]

In [12]:
# compute Confidence and Lift
def compute_rule_metrics(rules_df, df_onehot):
    total_invoices = len(df_onehot)
    metrics = []

    for _, row in rules_df.iterrows():
        antecedent = row["Antecedent"]
        consequent = row["Consequent"]

        support_A = (df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)).mean()
        support_B = (df_onehot[list(consequent)].sum(axis=1) == len(consequent)).mean()
        support_A_B = ((df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)) &
                       (df_onehot[list(consequent)].sum(axis=1) == len(consequent))).mean()

        confidence = support_A_B / support_A if support_A > 0 else 0
        lift = confidence / support_B if support_B > 0 else 0
        interestingness = confidence * lift  # Tradeoff metric

        metrics.append((antecedent, consequent, confidence, lift, interestingness))

    return pd.DataFrame(metrics, columns=["Antecedent", "Consequent", "Confidence", "Lift", "Interestingness"])

rules_eval_df = compute_rule_metrics(rules_df, df_onehot)




In [13]:
# Select Top 100 Most Interesting Rules
top_100_rules = rules_eval_df.sort_values(by="Interestingness", ascending=False).head(100)
print(top_100_rules)

    Antecedent   Consequent  Confidence    Lift  Interestingness
0  {Item_4029}  {Item_3293}         0.5  2500.0           1250.0
