# preprocessing

In [1]:
!git clone https://github.com/ItamarBerger/TabularDS

Cloning into 'TabularDS'...
remote: Enumerating objects: 274, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 274 (delta 48), reused 26 (delta 3), pack-reused 161 (from 1)[K
Receiving objects: 100% (274/274), 21.96 MiB | 8.34 MiB/s, done.
Resolving deltas: 100% (108/108), done.
Updating files: 100% (14/14), done.


In [2]:
# load dataset
import pandas as pd
retails = pd.read_csv('/content/TabularDS/final_project/code/data/online_retail.csv')

In [3]:
retails.head()
# print(len(retails))

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [4]:
# transforming raw text into IDs of items
retails['DescriptionID'] = retails['Description'].astype('category').cat.codes

# aggregating each transaction as set of items for each Invoice
retails_with_ids = retails.groupby('InvoiceNo')['DescriptionID'].apply(set).reset_index()
# retails_with_ids.head()

#Apriori for rule mining

In [6]:
from mlxtend.frequent_patterns import apriori, association_rules

# Convert transactions to a DataFrame
itemset = set(item for transaction in retails_with_ids for item in transaction)
data_dict = [{item: (item in transaction) for item in itemset} for transaction in retails_with_ids]
df = pd.DataFrame(data_dict)

# Apply Apriori algorithm
min_support = 0.01  # Minimum support threshold
frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)



In [7]:
# Generate association rules
min_confidence = 0.01  # Minimum confidence threshold
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=min_confidence)

print("Frequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)

Frequent Itemsets:
      support                           itemsets
0         0.5                                (v)
1         0.5                                (D)
2         0.5                                (p)
3         1.0                                (I)
4         0.5                                (t)
...       ...                                ...
2234      0.5     (D, I, e, o, c, n, r, p, s, i)
2235      0.5     (D, t, e, o, c, n, r, p, s, i)
2236      0.5     (D, I, t, e, o, c, n, r, s, i)
2237      0.5     (I, t, e, o, c, n, r, p, s, i)
2238      0.5  (D, I, t, e, o, c, n, r, p, s, i)

[2239 rows x 2 columns]

Association Rules:
       antecedents                     consequents  antecedent support  \
0              (v)                             (I)                 0.5   
1              (I)                             (v)                 1.0   
2              (v)                             (e)                 0.5   
3              (e)                             (v)  

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [13]:
# Filter interesting rules
interesting_rules = rules[(rules['confidence'] >= 0.8) & (rules['lift'] > 1.2) & (rules['support'] > 0.5)]
print("\nMost Interesting Association Rules:")
print(interesting_rules.head(100))



Most Interesting Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [None]:
# convert dataset into novice-item table. just like done in recommendation systems.
df_onehot = pd.crosstab(retails['InvoiceNo'], retails['DescriptionID'])

# Rename columns to include "Item_" prefix for clarity
df_onehot.columns = [f'Item_{col}' for col in df_onehot.columns]


# truncate dataset for better time complexity experiment
# df_onehot = df_onehot.iloc[:5000]

In [None]:
df_onehot.head(40)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

item_sim_matrix = cosine_similarity(df_onehot.T)  # Item-item similarity
item_sim_df = pd.DataFrame(item_sim_matrix, index=df_onehot.columns, columns=df_onehot.columns)


In [None]:
# perform item-based CF for rule mining
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_item_ratings(invoice_items, item_sim_df):
    """
    Calculate predicted rating for items not in the invoice.
    Rating(Item X) = sum(sim(Item X, item_in_invoice)) / sum(similarities)
    """
    scores = {}
    norm_factor = 0

    for item in invoice_items:
        similar_items = item_sim_df[item].drop(index=invoice_items, errors="ignore")  # Remove existing items
        for sim_item, similarity in similar_items.items():
            scores[sim_item] = scores.get(sim_item, 0) + similarity  # Aggregate similarity scores
            norm_factor += similarity  # Normalize by sum of similarities

    if norm_factor == 0:
        return {}

    # Normalize scores
    for item in scores:
        scores[item] /= norm_factor

    return scores

In [None]:
from tqdm.notebook import tqdm
threshold = 0.2  # Min rating for an item to be considered

rules = []
for invoice in tqdm(df_onehot.index,desc="Processing Raiting"):
    existing_items = df_onehot.columns[df_onehot.loc[invoice] == 1].tolist()
    predicted_ratings = compute_item_ratings(existing_items, item_sim_df)

    # Select items with rating above threshold
    recommended_items = {item for item, score in predicted_ratings.items() if score > threshold}

    if recommended_items:
        rules.append((set(existing_items), recommended_items))  # Format: {A, B, C} → {D, F}

rules_df = pd.DataFrame(rules, columns=["Antecedent", "Consequent"])


In [None]:
# compute Confidence and Lift
def compute_rule_metrics(rules_df, df_onehot):
    total_invoices = len(df_onehot)
    metrics = []

    for _, row in rules_df.iterrows():
        antecedent = row["Antecedent"]
        consequent = row["Consequent"]

        support_A = (df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)).mean()
        support_B = (df_onehot[list(consequent)].sum(axis=1) == len(consequent)).mean()
        support_A_B = ((df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)) &
                       (df_onehot[list(consequent)].sum(axis=1) == len(consequent))).mean()

        confidence = support_A_B / support_A if support_A > 0 else 0
        lift = confidence / support_B if support_B > 0 else 0
        interestingness = confidence * lift  # Tradeoff metric

        metrics.append((antecedent, consequent, confidence, lift, interestingness))

    return pd.DataFrame(metrics, columns=["Antecedent", "Consequent", "Confidence", "Lift", "Interestingness"])

rules_eval_df = compute_rule_metrics(rules_df, df_onehot)




In [None]:
# Select Top 100 Most Interesting Rules
top_100_rules = rules_eval_df.sort_values(by="Interestingness", ascending=False).head(100)
print(top_100_rules)