In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Read the groceries CSV file into a DataFrame
groceries_df = pd.read_csv(filepath_or_buffer='groceries - groceries.csv')

# Preview the first five rows to check the column layout
groceries_df.head(n=5)

In [None]:
# Convert every row into a plain Python list, leaving out the 'Item(s)' column
item_columns = groceries_df.drop(columns=['Item(s)'])
raw_rows = item_columns.values.tolist()

# Keep only the non-missing entries of each row (NaN cells are discarded)
transactions = []
for raw_row in raw_rows:
    transactions.append([entry for entry in raw_row if pd.notna(entry)])

# Preview the first five cleaned transactions
transactions[:5]

In [None]:
!pip install -q mlxtend

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# One-hot encode the transactions: learn the item vocabulary first, then transform
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

# Run Apriori with a minimum support of 0.01, reporting itemsets by column (item) name;
# dropna() discards any rows of the result that contain missing values
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True).dropna()

# Show the frequent itemsets, highest support first
frequent_itemsets.sort_values(by='support', ascending=False)

The support value indicates the proportion of transactions that contain the given itemset. For instance, "soda" appears in approximately 17.44% of all transactions.

In [None]:
from mlxtend.frequent_patterns import association_rules

# Compute association rules, using lift as the selection metric with a minimum threshold of 1
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

# Sort rules by lift in descending order and display the top 10 rules.
# .head(10) makes the output match the stated "top 10" instead of dumping every rule;
# the full table remains available in `rules`.
rules.sort_values(by='lift', ascending=False)[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)

To identify which items are frequently bought together, we generate association rules and use metrics such as confidence and lift to evaluate the strength of these rules.

Support: This metric gives the fraction of transactions that contain the itemset. For instance, the combination of "root vegetables" and "other vegetables, tropical fruit" appears in about 1.23% of all transactions.

Confidence: This metric indicates the likelihood of item Y being purchased when item X is purchased. For example, when "whole milk" and "yogurt" are purchased, there's a 17.97% chance that "curd" will also be purchased.

Lift: This metric measures how much more likely item Y is to be purchased when item X is purchased, compared with how often item Y is purchased overall. A lift value greater than 1 indicates that the items are bought together more often than would be expected if they were independent.

In [None]:
# Compute association rules based on leverage and conviction metrics.
# The rules are mined from `frequent_itemsets` (the Apriori output); the name
# `frequent_itemsets_frozenset` used here before is not defined anywhere in the visible
# notebook and raised NameError on a fresh kernel.
rules_leverage = association_rules(frequent_itemsets, metric='leverage', min_threshold=0)
rules_conviction = association_rules(frequent_itemsets, metric='conviction', min_threshold=0)

# Concatenate all the rules dataframes, drop exact duplicate rows, and reset the index
# so that every remaining rule has a unique row label
all_rules = (
    pd.concat([rules, rules_leverage, rules_conviction], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Display the top 10 rules (by lift) from the combined dataset
all_rules.sort_values(by='lift', ascending=False).head(10)

Leverage: It measures the difference between the observed frequency of A and B occurring together and the frequency that would be expected if A and B were independent. A leverage value of 0 indicates independence.

Conviction: A high conviction value means that the consequent is highly dependent on the antecedent. For instance, a conviction value of 1.5 means that the rule would be incorrect 50% more often if the association between the antecedent and the consequent was purely random chance.

Zhang's metric measures how far the observed co-occurrence frequency of the antecedent and consequent deviates from the frequency that would be expected if they were independent. Its values range from -1 to 1:

A value close to 1 indicates a strong positive association.

A value close to -1 indicates a strong negative association.

A value close to 0 indicates no significant association.

In [None]:
# Pick the 20 strongest rules by lift; head(20) on the unsorted frame would only take
# whichever 20 rows happen to come first
top_rules = all_rules.sort_values(by='lift', ascending=False).head(20)

# Unique items from the top 20 rules, sorted so the x-axis order is identical on every run
# (assumes item labels are mutually comparable, e.g. all strings)
top_items = sorted(
    set(top_rules['antecedents'].explode().to_list())
    | set(top_rules['consequents'].explode().to_list())
)
item_position = {item: position for position, item in enumerate(top_items)}


def _representative_position(itemset):
    """Return the x position of the itemset's smallest item, or None if that item is not a top item."""
    # min() is a deterministic pick; list(frozenset)[0] follows hash order, which can
    # change between kernel restarts because string hashing is randomized per process
    return item_position.get(min(itemset))


# Map the antecedents and consequents to their numerical x positions (None when outside the top items)
all_rules['FromN'] = all_rules['antecedents'].apply(_representative_position)
all_rules['ToN'] = all_rules['consequents'].apply(_representative_position)

# Filter out rules that don't have both antecedents and consequents in the top items
filtered_rules = all_rules.dropna(subset=['FromN', 'ToN'])

# Scale colours to the actual support range; the same norm drives the lines and the colorbar
norm = plt.Normalize(filtered_rules['support'].min(), filtered_rules['support'].max())
cmap = plt.cm.viridis

# Plot the graph: antecedent on the bottom row (y=0), consequent on the top row (y=1)
fig, ax = plt.subplots(figsize=(15, 7))
for _, row in filtered_rules.iterrows():
    ax.plot([row['FromN'], row['ToN']], [0, 1], 'o-',
            c=cmap(norm(row['support'])),
            markersize=20,
            lw=row['confidence'] * 10)  # line width encodes confidence

sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
sm.set_array([])
cb = fig.colorbar(sm, ax=ax)
cb.set_label('Support')

# Label both the bottom and top x-axes on this Axes only, instead of mutating global rcParams
ax.set_xticks(list(range(len(top_items))))
ax.set_xticklabels(top_items)
ax.tick_params(axis='x', top=True, labeltop=True, labelrotation=90)
ax.set_yticks([])
ax.set_title('Association rules among the top items (bottom: antecedent, top: consequent)')
plt.show()

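Each line connects two items that have a strong association: the marker on the bottom row is the antecedent and the marker on the top row is the consequent. The color of the line represents the support of the rule (with the viridis colormap, lighter yellow-green colors indicate higher support and darker purple colors indicate lower support), and the line width represents the confidence of the rule.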

In [None]:
import networkx as nx

# Create an undirected item-item network graph
G = nx.Graph()

# Add one edge per association rule, storing the rule's lift as the edge weight.
# Each itemset is represented by its smallest item: min() is deterministic, whereas
# tuple(frozenset)[0] follows hash order, which can change between kernel restarts.
# (Assumes item labels are mutually comparable, e.g. all strings.)
# When several rules map to the same pair of nodes, the last rule's lift is kept.
for antecedent, consequent, lift in zip(all_rules['antecedents'], all_rules['consequents'], all_rules['lift']):
    G.add_edge(min(antecedent), min(consequent), weight=lift)

# Plot the network graph; the layout is seeded so the figure is reproducible on re-run
plt.figure(figsize=(20, 20))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=1000)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=12)
plt.title('Item-Item Association Network Graph')
plt.axis('off')
plt.show()