In [None]:
import pandas as pd
import re
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx


In [None]:
# --- Load Dataset ---
# Location of the transactions CSV uploaded to the repo.
# TODO: `url` is currently a blank placeholder - fill in the dataset URL/path.
url = ""

# Fail with an actionable message instead of the opaque file-not-found error
# that pandas raises for an empty path.
if not url:
    raise ValueError(
        "`url` is empty: set it to the URL or path of the transactions CSV "
        "before running this cell."
    )

data = pd.read_csv(url)

# The preprocessing cell groups on these two columns, so verify them up front.
required_columns = {"Member_number", "itemDescription"}
missing_columns = required_columns - set(data.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

# Inspect first few rows
print(data.head())

In [None]:
# --- Data Preprocessing ---
# Text formatting: lower-case the item names, remove digits and collapse
# repeated whitespace.
# The cleaning has to go through the `.str` accessor of the item column:
# `str.lower` and `re.sub` work on a single string and raise when handed a
# whole DataFrame.
item_text = (
    data['itemDescription']
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()  # removing digits can leave leading/trailing blanks
)

# Rows without an item description cannot be part of a basket; dropping them
# keeps NaN out of the transaction lists handed to the encoder.
data = data.assign(itemDescription=item_text).dropna(
    subset=['itemDescription'])

# One-hot encode all items
# One transaction per member: the list of every item that member bought.
transactions = data.groupby('Member_number')[
    'itemDescription'].apply(list).values.tolist()

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Inspect
df.head()

In [None]:
# --- Frequent itemset mining (Apriori) ---
# Keep itemsets whose support is at least 0.01 and label them with the item
# names (the DataFrame column names) rather than column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.01,
    use_colnames=True,
)

print(frequent_itemsets.head())

In [None]:
# --- Use association_rules ---
# `association_rules` filters on one metric only and has no `min_lift`
# keyword (passing it raises TypeError). Apply the confidence threshold in the
# call and the lift threshold as a second filter on the result.
# NOTE(review): some mlxtend releases also expect a `num_itemsets` argument
# (the number of transactions, i.e. len(df)) - check the installed version.
rules = (
    association_rules(
        frequent_itemsets, metric="confidence", min_threshold=0.6
    )
    .loc[lambda candidates: candidates["lift"] >= 1.2]
    .reset_index(drop=True)  # contiguous index after dropping low-lift rules
)

print(rules.head())

In [None]:
# --- Visualisations ---
# Bar chart of the five items with the highest column totals in the one-hot
# encoded frame.
item_frequencies = df.sum().sort_values(ascending=False)
top_five = item_frequencies.head(5)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_five.values, y=top_five.index, ax=ax)
ax.set_title('Top 5 Frequent Items')
ax.set_xlabel('Frequency')
ax.set_ylabel('Items')
plt.show()

In [None]:
# Association rules as a scatter plot: support on x, confidence on y,
# marker colour encodes lift.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    rules['support'],
    rules['confidence'],
    c=rules['lift'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label='Lift')
ax.set_xlabel('Support')
ax.set_ylabel('Confidence')
ax.set_title('Scatter Plot of Association Rules')
plt.show()

In [None]:
# Heatmap of Confidence
# Turn each frozenset of items into a readable label. Sorting the items makes
# the label deterministic; plain set iteration order can differ between
# kernel sessions, which would reshuffle the axis labels from run to run.
rules['antecedents_str'] = rules['antecedents'].apply(
    lambda items: ', '.join(sorted(items)))
rules['consequents_str'] = rules['consequents'].apply(
    lambda items: ', '.join(sorted(items)))

# The five antecedent / consequent labels with the largest summed support
top_ants = rules.groupby('antecedents_str')['support'].sum().nlargest(5).index
top_cons = rules.groupby('consequents_str')['support'].sum().nlargest(5).index

# Keep only rules whose both sides are among those top labels
filtered = rules[(rules['antecedents_str'].isin(top_ants)) &
                 (rules['consequents_str'].isin(top_cons))]

if filtered.empty:
    # sns.heatmap cannot draw an empty matrix; report it instead of raising.
    print('No association rules available to draw the confidence heatmap.')
else:
    # Rows = antecedents, columns = consequents, cell = rule confidence
    heatmap_data = filtered.pivot(
        index='antecedents_str', columns='consequents_str',
        values='confidence')

    plt.figure(figsize=(12, 8))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu',
                linewidths=0.5, cbar_kws={'label': 'Confidence'})
    plt.title('Heatmap of Confidence for Top Association Rules')
    plt.xlabel('Consequents')
    plt.ylabel('Antecedents')
    plt.show()