In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = "Output/processed_groceries_data.csv"
transactions_df = pd.read_csv(file_path)

In [3]:
one_hot = transactions_df['Transactions'].str.get_dummies(', ').astype(bool)


In [4]:
train_data, test_data = train_test_split(one_hot, test_size=0.2, random_state=42)


In [5]:
min_support = 0.0005    # Lower min_support for Apriori


In [6]:
frequent_itemsets_apriori = apriori(train_data, min_support=min_support, use_colnames=True, low_memory=True)
frequent_itemsets_apriori['support_count'] = frequent_itemsets_apriori['support'] * len(train_data)

In [7]:
print("Frequent Itemsets (Apriori):\n", frequent_itemsets_apriori.head())


Frequent Itemsets (Apriori):
     support                    itemsets  support_count
0  0.002757  ('Instant food products'])           33.0
1  0.005096                ('UHT-milk')           61.0
2  0.008020               ('UHT-milk'])           96.0
3  0.000835       ('abrasive cleaner'])           10.0
4  0.000501        ('artif. sweetener')            6.0


In [8]:
rules_apriori = association_rules(frequent_itemsets_apriori, metric="lift", min_threshold=1.0, num_itemsets=2)
print("Apriori Rules:\n", rules_apriori.head())

Apriori Rules:
            antecedents          consequents  antecedent support  \
0         ('UHT-milk')  (['tropical fruit')            0.005096   
1  (['tropical fruit')         ('UHT-milk')            0.044194   
2        ('UHT-milk'])   (['bottled water')            0.008020   
3   (['bottled water')        ('UHT-milk'])            0.019967   
4        ('UHT-milk'])         (['sausage')            0.008020   

   consequent support   support  confidence     lift  representativity  \
0            0.044194  0.000585    0.114754  2.59661               1.0   
1            0.005096  0.000585    0.013233  2.59661               1.0   
2            0.019967  0.000501    0.062500  3.13023               1.0   
3            0.008020  0.000501    0.025105  3.13023               1.0   
4            0.049624  0.000501    0.062500  1.25947               1.0   

   leverage  conviction  zhangs_metric   jaccard  certainty  kulczynski  
0  0.000360    1.079707       0.618032  0.012007   0.073823   

In [9]:
# FP-Growth method
frequent_itemsets_fp_growth = fpgrowth(train_data, min_support=min_support, use_colnames=True)
frequent_itemsets_fp_growth['support_count'] = frequent_itemsets_fp_growth['support'] * len(train_data)


In [10]:
print("Frequent Itemsets (FP-Growth):\n", frequent_itemsets_fp_growth.head())


Frequent Itemsets (FP-Growth):
     support               itemsets  support_count
0  0.055138  (['other vegetables')          660.0
1  0.047536        ('whole milk'])          569.0
2  0.036675      (['citrus fruit')          439.0
3  0.017043   ('root vegetables'])          204.0
4  0.069674        (['whole milk')          834.0


In [11]:
rules_fp_growth = association_rules(frequent_itemsets_fp_growth, metric="lift", min_threshold=1.0, num_itemsets=2)
print("FP-Growth Rules:\n", rules_fp_growth.head())

FP-Growth Rules:
              antecedents            consequents  antecedent support  \
0  (['other vegetables')        ('whole milk'])            0.055138   
1        ('whole milk'])  (['other vegetables')            0.047536   
2           (['sausage')        ('whole milk'])            0.049624   
3        ('whole milk'])           (['sausage')            0.047536   
4      (['citrus fruit')        ('rolls/buns'])            0.036675   

   consequent support   support  confidence      lift  representativity  \
0            0.047536  0.003759    0.068182  1.434335               1.0   
1            0.055138  0.003759    0.079086  1.434335               1.0   
2            0.047536  0.003258    0.065657  1.381211               1.0   
3            0.049624  0.003258    0.068541  1.381211               1.0   
4            0.046366  0.001921    0.052392  1.129964               1.0   

   leverage  conviction  zhangs_metric   jaccard  certainty  kulczynski  
0  0.001138    1.022157       

In [29]:
print(f"Apriori Total Rules: {len(rules_apriori)}")
print(f"FP-Growth Total Rules: {len(rules_fp_growth)}")

Apriori Total Rules: 1544
FP-Growth Total Rules: 1544


In [30]:
rules_apriori['lift'] = pd.to_numeric(rules_apriori['lift'], errors='coerce')
rules_apriori['confidence'] = pd.to_numeric(rules_apriori['confidence'], errors='coerce')

rules_fp_growth['lift'] = pd.to_numeric(rules_fp_growth['lift'], errors='coerce')
rules_fp_growth['confidence'] = pd.to_numeric(rules_fp_growth['confidence'], errors='coerce')



In [31]:
rules_apriori['LHS'] = rules_apriori['antecedents'].apply(lambda x: ', '.join(list(x)))
rules_apriori['RHS'] = rules_apriori['consequents'].apply(lambda x: ', '.join(list(x)))

rules_fp_growth['LHS'] = rules_fp_growth['antecedents'].apply(lambda x: ', '.join(list(x)))
rules_fp_growth['RHS'] = rules_fp_growth['consequents'].apply(lambda x: ', '.join(list(x)))



In [37]:
top_apriori_by_lift = rules_apriori.nlargest(10, 'lift').head(10)
top_fpgrowth_by_lift = rules_fp_growth.nlargest(10, 'lift').head(10)

In [38]:
if len(top_apriori_by_lift) > 0:
    plt.figure(figsize=(12, 8))  # Adjust figure size
    sns.barplot(data=top_apriori_by_lift, x='lift', y='RHS', color='skyblue')
    plt.title('Top 10 Apriori Rules by Lift')
    plt.xlabel('Lift')
    plt.ylabel('Consequents (RHS)')
    
    # Annotate with LHS, Support, and Confidence
    for index, row in top_apriori_by_lift.iterrows():
        plt.text(row['lift'] + 0.01, index, 
                 f"LHS: {row['LHS']}, Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}", 
                 color='black', ha="left", va="center", fontsize=10)
    plt.tight_layout()
    plt.show()

  plt.tight_layout()


ValueError: Image size of 1548x105214 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1200x800 with 1 Axes>

In [39]:
if len(top_fpgrowth_by_lift) > 0:
    plt.figure(figsize=(12, 8))  # Adjust figure size
    sns.barplot(data=top_fpgrowth_by_lift, x='lift', y='RHS', color='lightcoral')
    plt.title('Top 10 FP-Growth Rules by Lift')
    plt.xlabel('Lift')
    plt.ylabel('Consequents (RHS)')
    
    # Annotate with LHS, Support, and Confidence
    for index, row in top_fpgrowth_by_lift.iterrows():
        plt.text(row['lift'] + 0.01, index, 
                 f"LHS: {row['LHS']}, Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}", 
                 color='black', ha="left", va="center", fontsize=10)
    plt.tight_layout()
    plt.show()

  plt.tight_layout()


ValueError: Image size of 1548x90635 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1200x800 with 1 Axes>