In [None]:
# TP1: ASSOCIATION RULES (Market Basket Analysis)
# Purpose: Find patterns in transactional data (which products are bought together)
# Algorithm: Apriori algorithm for frequent itemsets and association rules

# Import necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from mlxtend.frequent_patterns import apriori, association_rules  # For market basket analysis
import warnings
warnings.filterwarnings('ignore')  # Suppress warning messages


In [None]:
# STEP 1: Read the transactions table
# The CSV is expected to hold one column per product, with a flag telling
# whether that product was bought (True/1) or not (False/0).
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact file exists; consider a path relative to the notebook instead.
csv_path = r'C:\Users\ASUS\Downloads\produits.csv'
dataset = pd.read_csv(csv_path)
dataset  # Show the table to confirm it was read correctly


In [5]:
# STEP 2: Cast every column to boolean
# apriori is fed a table of True/False flags, so numeric indicators are
# turned into booleans here (0 -> False, any non-zero value -> True).
# NOTE(review): a missing value (NaN) would presumably become True under this
# cast — confirm the CSV has no gaps.
dataset = dataset.astype(bool)
dataset  # Show the boolean table


Unnamed: 0,B,P,S,C,E
0,True,False,True,True,True
1,False,True,False,True,False
2,False,False,True,True,True
3,True,False,False,False,True
4,False,True,True,True,False


In [20]:
# STEP 3: Mine the frequent itemsets with Apriori
# - min_support=0.4 keeps only itemsets present in at least 40% of transactions
# - use_colnames=True labels the itemsets with the product column names
# A 'length' column (number of products in the itemset) is appended so the
# result can be filtered by itemset size in the following steps.
frequent_itemsets = (
    apriori(dataset, min_support=0.4, use_colnames=True)
    .assign(length=lambda found: found['itemsets'].map(len))
)
frequent_itemsets  # Every frequent itemset with its support and size


Unnamed: 0,support,itemsets,length
0,0.4,(B),1
1,0.4,(P),1
2,0.6,(S),1
3,0.8,(C),1
4,0.6,(E),1
5,0.4,"(B, E)",2
6,0.4,"(C, P)",2
7,0.6,"(C, S)",2
8,0.4,"(E, S)",2
9,0.4,"(C, E)",2


In [19]:
# STEP 4: Keep only the frequent itemsets made of exactly 2 products
# These are the product pairs that are frequently bought together.
frequent_itemsets.loc[frequent_itemsets['length'].eq(2)]


Unnamed: 0,support,itemsets,length
5,0.4,"(B, E)",2
6,0.4,"(C, P)",2
7,0.6,"(C, S)",2
8,0.4,"(E, S)",2
9,0.4,"(C, E)",2


In [21]:
# STEP 5: Keep only the frequent itemsets made of 3 or more products
# These are the larger product combinations that are bought together.
frequent_itemsets.loc[frequent_itemsets['length'].ge(3)]


Unnamed: 0,support,itemsets,length
10,0.4,"(C, E, S)",3


In [31]:
# STEP 6: Frequent itemsets of size >= 2 that include product 'C'
# Each itemset is compared with the set {'C'} using >=, which for sets means
# "is a superset of" — i.e. the itemset contains 'C'.
has_two_or_more = frequent_itemsets['length'] >= 2
includes_c = frequent_itemsets['itemsets'] >= {'C'}
frequent_itemsets[has_two_or_more & includes_c]


Unnamed: 0,support,itemsets,length
6,0.4,"(C, P)",2
7,0.6,"(C, S)",2
9,0.4,"(C, E)",2
10,0.4,"(C, E, S)",3


In [33]:
# STEP 7: Derive association rules from the frequent itemsets
# Rules are selected on the "confidence" metric with a threshold of 0.6:
# among the transactions containing the antecedent, the consequent must
# appear in at least 60% of them.
# A rule reads: "customers who buy the antecedent also tend to buy the consequent".
rules1 = association_rules(
    frequent_itemsets,
    min_threshold=0.6,
    metric="confidence",
)
rules1  # All rules that pass the confidence threshold


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(B),(E),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
1,(E),(B),0.6,0.4,0.4,0.666667,1.666667,1.0,0.16,1.8,1.0,0.666667,0.444444,0.833333
2,(P),(C),0.4,0.8,0.4,1.0,1.25,1.0,0.08,inf,0.333333,0.5,1.0,0.75
3,(C),(S),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
4,(S),(C),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
5,(E),(S),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667
6,(S),(E),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667
7,(E),(C),0.6,0.8,0.4,0.666667,0.833333,1.0,-0.08,0.6,-0.333333,0.4,-0.666667,0.583333
8,"(C, E)",(S),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
9,"(C, S)",(E),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667


In [36]:
# STEP 8: Keep only the rules whose lift is strictly greater than 1
#   lift > 1 : antecedent and consequent occur together more often than by chance
#   lift = 1 : independence (no relationship)
#   lift < 1 : they occur together less often than by chance
# reset_index() renumbers the rows and keeps the original rule number in an
# 'index' column.
positively_correlated = rules1['lift'].gt(1)
rules2 = rules1.loc[positively_correlated].reset_index()
rules2  # The positively correlated ("interesting") rules


Unnamed: 0,index,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,0,(B),(E),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
1,1,(E),(B),0.6,0.4,0.4,0.666667,1.666667,1.0,0.16,1.8,1.0,0.666667,0.444444,0.833333
2,2,(P),(C),0.4,0.8,0.4,1.0,1.25,1.0,0.08,inf,0.333333,0.5,1.0,0.75
3,3,(C),(S),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
4,4,(S),(C),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
5,5,(E),(S),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667
6,6,(S),(E),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667
7,8,"(C, E)",(S),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
8,9,"(C, S)",(E),0.6,0.6,0.4,0.666667,1.111111,1.0,0.04,1.2,0.25,0.5,0.166667,0.666667
9,10,"(E, S)",(C),0.4,0.8,0.4,1.0,1.25,1.0,0.08,inf,0.333333,0.5,1.0,0.75


In [40]:
# STEP 9: The 4 rules with the highest lift
# Rules are ordered from highest to lowest lift and the first 4 are kept.
# NOTE(review): several rules may share the same lift; which of the tied rules
# end up in the top 4 depends on the sort's tie order — confirm if that matters.
rules3 = rules2.sort_values('lift', ascending=False).head(4)
rules3


Unnamed: 0,index,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,0,(B),(E),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
1,1,(E),(B),0.6,0.4,0.4,0.666667,1.666667,1.0,0.16,1.8,1.0,0.666667,0.444444,0.833333
7,8,"(C, E)",(S),0.4,0.6,0.4,1.0,1.666667,1.0,0.16,inf,0.666667,0.666667,1.0,0.833333
11,12,(S),"(C, E)",0.6,0.4,0.4,0.666667,1.666667,1.0,0.16,1.8,1.0,0.666667,0.444444,0.833333


In [None]:
# STEP 10: Select specific rules among the top ones
# Keep the rules whose antecedent contains 'E' AND whose consequent contains 'S'
# (each side is compared with a one-element set using >=, i.e. "is a superset of").
# Example of a matching rule: "customers who buy E also buy S".
antecedent_has_e = rules3['antecedents'] >= {'E'}
consequent_has_s = rules3['consequents'] >= {'S'}
rules3[antecedent_has_e & consequent_has_s]


In [None]:
# STEP 11: Search for a minimum-support threshold that yields enough rules
# Candidate thresholds are scanned from 1.0 down to 0.1 (step 0.1). The scan
# stops at the FIRST threshold — i.e. the highest support value — for which at
# least 2 rules are produced with the "lift" metric and min_threshold=1
# (rules whose lift reaches the threshold, so lift == 1 is presumably kept too;
# see STEP 8 for a strict lift > 1 filter).
#
# The thresholds are rounded to one decimal: linspace values can carry binary
# floating-point noise (e.g. 0.3999999999999999 instead of 0.4), which would
# leak into the printed values and into apriori's support comparison.
c = np.round(np.linspace(1, 0.1, 10), 1)  # 1.0, 0.9, ..., 0.1
for i in c:
    candidate_itemsets = apriori(dataset, min_support=i, use_colnames=True)

    # No itemset reaches this support: report it explicitly instead of relying
    # on a bare `except:` that would also hide unrelated errors (bad input,
    # API changes, keyboard interrupts) behind the same message.
    if candidate_itemsets.empty:
        print("i{}, NO_FREQUENT_ITEMSET".format(i))
        continue

    rules = association_rules(candidate_itemsets, metric="lift", min_threshold=1)

    if len(rules) >= 2:
        # Highest threshold that still gives at least 2 rules: stop here
        print("S_min={} ".format(i))
        break

    # Fewer than 2 rules at this threshold: report the count and keep lowering it
    print("i{}, NB_INTRESTING_RULES={}".format(i, len(rules)))
