# preprocessing

In [51]:
!git clone https://github.com/ItamarBerger/TabularDS

fatal: destination path 'TabularDS' already exists and is not an empty directory.


In [52]:
# load dataset
import pandas as pd
data = pd.read_csv('/content/TabularDS/final_project/code/data/online_retail.csv')
# data = retails.copy()

In [53]:
# retails.head()
# print(len(retails))

In [54]:
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data.dropna(subset=['InvoiceNo', 'Description'], inplace=True)

transactions = data.groupby('InvoiceNo')['Description'].apply(list).tolist()

In [None]:
transactions.head()

In [55]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Convert the transaction data into a one-hot encoded DataFrame
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Apply Apriori to find frequent itemsets (min_support can be adjusted based on your needs)
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

# Generate association rules (min_threshold can be adjusted for confidence)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.0)

# Display the results
print(rules)


Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [56]:
# transforming raw text into IDs of items
retails['DescriptionID'] = retails['Description'].astype('category').cat.codes

# aggregating each transaction as set of items for each Invoice
retails = retails.groupby('InvoiceNo')['DescriptionID'].apply(set).reset_index()
# retails_with_ids.head()

KeyError: 'Description'

In [None]:
retails.head()

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Convert sets to lists
transactions = retails["DescriptionID"].apply(list).tolist()  # Convert sets to lists

# Apply TransactionEncoder
encoder = TransactionEncoder()
encoded_array = encoder.fit(transactions).transform(transactions)

# Convert to a DataFrame
df_encoded = pd.DataFrame(encoded_array, columns=encoder.columns_)



In [None]:
print(df_encoded.head())  # Check transformed DataFrame


#Apriori for rule mining

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Convert transactions to a DataFrame
itemset = set(item for transaction in retails for item in transaction)
data_dict = [{item: (item in transaction) for item in itemset} for transaction in retails]
df = pd.DataFrame(data_dict)

# Apply Apriori algorithm
min_support = 0.5  # Minimum support threshold
frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)



In [None]:
# Generate association rules
min_confidence = 0.03  # Minimum confidence threshold
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=min_confidence)

print("Frequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)

In [None]:
# Filter interesting rules
interesting_rules = rules[(rules['confidence'] >= 0.8) & (rules['lift'] > 1.2) & (rules['support'])]
print("\nMost Interesting Association Rules:")
print(interesting_rules.head(100))


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

item_sim_matrix = cosine_similarity(df_onehot.T)  # Item-item similarity
item_sim_df = pd.DataFrame(item_sim_matrix, index=df_onehot.columns, columns=df_onehot.columns)


In [None]:
# perform item-based CF for rule mining
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_item_ratings(invoice_items, item_sim_df):
    """
    Calculate predicted rating for items not in the invoice.
    Rating(Item X) = sum(sim(Item X, item_in_invoice)) / sum(similarities)
    """
    scores = {}
    norm_factor = 0

    for item in invoice_items:
        similar_items = item_sim_df[item].drop(index=invoice_items, errors="ignore")  # Remove existing items
        for sim_item, similarity in similar_items.items():
            scores[sim_item] = scores.get(sim_item, 0) + similarity  # Aggregate similarity scores
            norm_factor += similarity  # Normalize by sum of similarities

    if norm_factor == 0:
        return {}

    # Normalize scores
    for item in scores:
        scores[item] /= norm_factor

    return scores

In [None]:
from tqdm.notebook import tqdm
threshold = 0.2  # Min rating for an item to be considered

rules = []
for invoice in tqdm(df_onehot.index,desc="Processing Raiting"):
    existing_items = df_onehot.columns[df_onehot.loc[invoice] == 1].tolist()
    predicted_ratings = compute_item_ratings(existing_items, item_sim_df)

    # Select items with rating above threshold
    recommended_items = {item for item, score in predicted_ratings.items() if score > threshold}

    if recommended_items:
        rules.append((set(existing_items), recommended_items))  # Format: {A, B, C} → {D, F}

rules_df = pd.DataFrame(rules, columns=["Antecedent", "Consequent"])


In [None]:
# compute Confidence and Lift
def compute_rule_metrics(rules_df, df_onehot):
    total_invoices = len(df_onehot)
    metrics = []

    for _, row in rules_df.iterrows():
        antecedent = row["Antecedent"]
        consequent = row["Consequent"]

        support_A = (df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)).mean()
        support_B = (df_onehot[list(consequent)].sum(axis=1) == len(consequent)).mean()
        support_A_B = ((df_onehot[list(antecedent)].sum(axis=1) == len(antecedent)) &
                       (df_onehot[list(consequent)].sum(axis=1) == len(consequent))).mean()

        confidence = support_A_B / support_A if support_A > 0 else 0
        lift = confidence / support_B if support_B > 0 else 0
        interestingness = confidence * lift  # Tradeoff metric

        metrics.append((antecedent, consequent, confidence, lift, interestingness))

    return pd.DataFrame(metrics, columns=["Antecedent", "Consequent", "Confidence", "Lift", "Interestingness"])

rules_eval_df = compute_rule_metrics(rules_df, df_onehot)




In [None]:
# Select Top 100 Most Interesting Rules
top_100_rules = rules_eval_df.sort_values(by="Interestingness", ascending=False).head(100)
print(top_100_rules)