In [1]:
# Load the transactions from our tab-separated text file
import pandas as pd

DATA_FILE = "market_basket.txt"
dataset = pd.read_table(DATA_FILE, sep="\t")

In [2]:
# Show the first ten rows
dataset.head(10)

Unnamed: 0,ID,Product
0,1,Peaches
1,2,Vegetable_Oil
2,2,Frozen_Corn
3,3,Plums
4,4,Pancake_Mix
5,5,Cheese
6,6,Cauliflower
7,7,2pct_Milk
8,8,98pct_Fat_Free_Hamburger
9,8,Potato_Chips


In [3]:
# Dimensions of the raw table as (rows, columns)
dataset.shape

(12935, 2)

In [4]:
# Pivot the data into a binary table: transaction IDs as rows, products as columns.
# crosstab() counts occurrences, so a product listed twice in the same transaction
# would yield 2; clip(upper=1) caps every cell at 1 to keep a strict 0/1 presence
# matrix, which is the encoding apriori() expects.
df = pd.crosstab(dataset.ID, dataset.Product).clip(upper=1)
df.iloc[:30,:7]

Product,100_Watt_Lightbulb,2pct_Milk,40_Watt_Lightbulb,60_Watt_Lightbulb,75_Watt_Lightbulb,98pct_Fat_Free_Hamburger,AA_Cell_Batteries
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0
8,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0


In [5]:
# Dimensions of the pivoted table as (rows, columns)
df.shape

(1360, 303)

In [6]:
# Display the column labels
print(df.columns)

Index(['100_Watt_Lightbulb', '2pct_Milk', '40_Watt_Lightbulb',
       '60_Watt_Lightbulb', '75_Watt_Lightbulb', '98pct_Fat_Free_Hamburger',
       'AA_Cell_Batteries', 'Apple_Cinnamon_Waffles', 'Apple_Drink',
       'Apple_Fruit_Roll',
       ...
       'White_Bread', 'White_Wine', 'White_Zinfandel_Wine', 'Whole_Corn',
       'Whole_Green_Beans', 'Whole_Milk', 'Window_Cleaner', 'Wood_Polish',
       'flav_Fruit_Bars', 'flav_Ice'],
      dtype='object', name='Product', length=303)


In [7]:
# Importation de la fonction apriori
from mlxtend.frequent_patterns import apriori

In [8]:
# Frequent itemsets: minimum support of 0.025 and at most 4 items per itemset;
# use_colnames=True labels the items with the column names of df
freq_itemsets = apriori(df, min_support=0.025, max_len=4, use_colnames=True)

In [9]:
# Check the type of the object returned by apriori()
type(freq_itemsets)

pandas.core.frame.DataFrame

In [10]:
# Column names of the frequent-itemsets table
freq_itemsets.columns

Index(['support', 'itemsets'], dtype='object')

In [11]:
# Dimensions of the frequent-itemsets table as (rows, columns)
freq_itemsets.shape

(603, 2)

In [12]:
# Preview the first ten rows of the frequent-itemsets table
freq_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.030147,(100_Watt_Lightbulb)
1,0.109559,(2pct_Milk)
2,0.0375,(60_Watt_Lightbulb)
3,0.031618,(75_Watt_Lightbulb)
4,0.093382,(98pct_Fat_Free_Hamburger)
5,0.031618,(AA_Cell_Batteries)
6,0.025735,(Apple_Cinnamon_Waffles)
7,0.026471,(Apple_Drink)
8,0.031618,(Apple_Fruit_Roll)
9,0.032353,(Apple_Jam)


In [13]:
# Check the type of the itemsets column
type(freq_itemsets.itemsets)

pandas.core.series.Series

In [14]:
# Indexed access to the first element
first_itemset = freq_itemsets["itemsets"][0]
print(first_itemset)

frozenset({'100_Watt_Lightbulb'})


Un itemset correspond à un objet frozenset de Python, un type ensemble (set) qui n’est pas
modifiable (on dit immuable). Plusieurs solutions s’offrent à nous pour la recherche d’itemsets
répondant à des conditions de présence d’items.

## Solution 1 : Utilisation de la méthode apply() de pandas.Series

In [15]:
#fonction de test d'inclusion : 
def is_inclus(x, items):
    """Return True when every element of ``items`` is present in ``x``.

    Args:
        x: The itemset to inspect (any iterable, e.g. a frozenset of names).
        items: The items to look for. May be a set/frozenset, any other
            iterable of items, or a single item name given as a string.

    Returns:
        bool: True if ``items`` is a subset of ``x``, False otherwise.
    """
    # A bare string is one item, not an iterable of single characters.
    if isinstance(items, str):
        items = (items,)
    return frozenset(items).issubset(x)

In [16]:
# Find the positions of the itemsets that satisfy a condition
import numpy as np

# NOTE(review): the name `id` shadows the Python builtin; it is kept because
# the cell that displays the matching itemsets reads it.
aspirin_mask = freq_itemsets["itemsets"].apply(is_inclus, items={'Aspirin'})
id = np.where(aspirin_mask)
print(id)

(array([ 13, 208, 249, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
       282, 283, 284, 285, 286, 509, 510, 511, 552, 553, 554, 555, 556],
      dtype=int64),)


In [17]:
# Display the matching itemsets.
# np.where() returns a 1-tuple holding an array of integer positions, so unpack
# the array and use the positional indexer .iloc rather than the label-based .loc.
print(freq_itemsets.iloc[id[0]])

      support                              itemsets
13   0.066912                             (Aspirin)
208  0.034559                  (2pct_Milk, Aspirin)
249  0.027941   (98pct_Fat_Free_Hamburger, Aspirin)
272  0.027206                       (Cola, Aspirin)
273  0.025735              (Aspirin, Domestic_Beer)
274  0.038235                       (Eggs, Aspirin)
275  0.027206                   (Hot_Dogs, Aspirin)
276  0.027206                     (Onions, Aspirin)
277  0.027206   (Pepperoni_Pizza_-_Frozen, Aspirin)
278  0.025000               (Popcorn_Salt, Aspirin)
279  0.036029               (Aspirin, Potato_Chips)
280  0.030147                   (Aspirin, Potatoes)
281  0.030147               (Aspirin, Sweet_Relish)
282  0.028676               (Aspirin, Toilet_Paper)
283  0.025000                   (Tomatoes, Aspirin)
284  0.030882                 (Toothpaste, Aspirin)
285  0.025000                (Aspirin, Wheat_Bread)
286  0.041912                (Aspirin, White_Bread)
509  0.02573

Pour les impatients, nous pouvons définir à la volée la fonction de comparaison lors de l’appel de
la méthode apply() de Series via le mécanisme des fonctions anonymes lambda de Python.

In [18]:
# Shortcut: define the subset test on the fly with a lambda
np.where(
    freq_itemsets["itemsets"].apply(
        lambda itemset, wanted: wanted.issubset(itemset),
        wanted={'Aspirin'},
    )
)

(array([ 13, 208, 249, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
        282, 283, 284, 285, 286, 509, 510, 511, 552, 553, 554, 555, 556],
       dtype=int64),)

## Solution 2 : Utilisation des opérateurs de comparaison de pandas.Series

On va utiliser l'une des fonctions de comparaison vectorisées (Binary operators functions) :

In [19]:
# Itemsets containing Aspirin, using the native comparison methods of Series
# (between sets, "greater than or equal" means "is a superset of")
superset_of_aspirin = freq_itemsets['itemsets'].ge({'Aspirin'})
print(freq_itemsets[superset_of_aspirin])

      support                              itemsets
13   0.066912                             (Aspirin)
208  0.034559                  (2pct_Milk, Aspirin)
249  0.027941   (98pct_Fat_Free_Hamburger, Aspirin)
272  0.027206                       (Cola, Aspirin)
273  0.025735              (Aspirin, Domestic_Beer)
274  0.038235                       (Eggs, Aspirin)
275  0.027206                   (Hot_Dogs, Aspirin)
276  0.027206                     (Onions, Aspirin)
277  0.027206   (Pepperoni_Pizza_-_Frozen, Aspirin)
278  0.025000               (Popcorn_Salt, Aspirin)
279  0.036029               (Aspirin, Potato_Chips)
280  0.030147                   (Aspirin, Potatoes)
281  0.030147               (Aspirin, Sweet_Relish)
282  0.028676               (Aspirin, Toilet_Paper)
283  0.025000                   (Tomatoes, Aspirin)
284  0.030882                 (Toothpaste, Aspirin)
285  0.025000                (Aspirin, Wheat_Bread)
286  0.041912                (Aspirin, White_Bread)
509  0.02573

In [20]:
# Itemsets equal to exactly {Aspirin}: eq() tests equality, so only the singleton matches
print(freq_itemsets[freq_itemsets['itemsets'].eq({'Aspirin'})])

     support   itemsets
13  0.066912  (Aspirin)


In [21]:
# Itemsets containing both Aspirin and Eggs
wanted_items = {'Aspirin','Eggs'}
print(freq_itemsets[freq_itemsets['itemsets'].ge(wanted_items)])

      support                       itemsets
274  0.038235                (Eggs, Aspirin)
509  0.025735     (Eggs, 2pct_Milk, Aspirin)
552  0.025000  (Eggs, Aspirin, Potato_Chips)
553  0.029412   (Eggs, Aspirin, White_Bread)


In [22]:
# Same query with the items written in the opposite order: a set is unordered,
# so the result is the same as with {'Aspirin','Eggs'}
print(freq_itemsets[freq_itemsets['itemsets'].ge({'Eggs','Aspirin'})])

      support                       itemsets
274  0.038235                (Eggs, Aspirin)
509  0.025735     (Eggs, 2pct_Milk, Aspirin)
552  0.025000  (Eggs, Aspirin, Potato_Chips)
553  0.029412   (Eggs, Aspirin, White_Bread)


Nous obtenons le bon résultat, de manière autrement plus simple finalement.

## Extraction et déduction des règles d'association :

Après avoir extrait les itemsets fréquents avec la fonction apriori, on passe à la génération des règles d'association, que l'on filtre selon leur confiance.

La confiance d'une règle doit atteindre le seuil de confiance minimale (ici 0,75) :

In [23]:
# Function that derives the association rules
from mlxtend.frequent_patterns import association_rules

# Generate the rules from the frequent itemsets, filtering on the
# "confidence" metric with a minimum threshold of 0.75
regles = association_rules(freq_itemsets,metric="confidence",min_threshold=0.75)

In [24]:
# Type of the returned object
type(regles)

pandas.core.frame.DataFrame

In [25]:
# Dimensions of the rules table as (rows, columns)
regles.shape

(50, 9)

In [26]:
# List of columns
regles.columns

Index(['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction'],
      dtype='object')

In [27]:
# Display the "first" 7 rules
regles.head(7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(2pct_Milk, Aspirin)",(White_Bread),0.034559,0.119118,0.027206,0.787234,6.608878,0.023089,4.140147
1,"(2pct_Milk, Bananas)",(White_Bread),0.031618,0.119118,0.025735,0.813953,6.83319,0.021969,4.734743
2,"(Bananas, White_Bread)",(2pct_Milk),0.032353,0.109559,0.025735,0.795455,7.260525,0.022191,4.353268
3,"(Cola, Wheat_Bread)",(2pct_Milk),0.032353,0.109559,0.025735,0.795455,7.260525,0.022191,4.353268
4,"(Popcorn_Salt, 2pct_Milk)",(Eggs),0.033088,0.122794,0.027206,0.822222,6.695941,0.023143,4.934283
5,"(2pct_Milk, Potato_Chips)",(Eggs),0.044853,0.122794,0.033824,0.754098,6.14116,0.028316,3.567304
6,"(2pct_Milk, Tomatoes)",(Eggs),0.032353,0.122794,0.025735,0.795455,6.477953,0.021763,4.288562


In [28]:
# Rules restricted to a few columns for display
kept_columns = ['antecedents','consequents','lift']
myRegles = regles[kept_columns]
myRegles.shape

(50, 3)

In [29]:
# Display settings: show at most 5 columns (use None to show every column)
# and 3 digits after the decimal point.
pd.set_option('display.max_columns',5)
# Use the fully qualified key: the short pattern 'precision' also matches
# 'styler.format.precision' in recent pandas releases and raises OptionError.
pd.set_option('display.precision',3)

In [30]:
# Display the first 5 rules
myRegles.head(5)

Unnamed: 0,antecedents,consequents,lift
0,"(2pct_Milk, Aspirin)",(White_Bread),6.609
1,"(2pct_Milk, Bananas)",(White_Bread),6.833
2,"(Bananas, White_Bread)",(2pct_Milk),7.261
3,"(Cola, Wheat_Bread)",(2pct_Milk),7.261
4,"(Popcorn_Salt, 2pct_Milk)",(Eggs),6.696


In [31]:
# Display the rules whose lift is greater than or equal to 7
high_lift = myRegles['lift'] >= 7.0
myRegles[high_lift]

Unnamed: 0,antecedents,consequents,lift
2,"(Bananas, White_Bread)",(2pct_Milk),7.261
3,"(Cola, Wheat_Bread)",(2pct_Milk),7.261
8,"(Onions, Wheat_Bread)",(2pct_Milk),7.574
10,"(Wheat_Bread, Potatoes)",(2pct_Milk),7.053
13,"(Toothpaste, Wheat_Bread)",(2pct_Milk),7.38
16,"(Hamburger_Buns, White_Bread)",(98pct_Fat_Free_Hamburger),8.202
17,"(98pct_Fat_Free_Hamburger, Wheat_Bread)",(White_Bread),7.556
29,"(Hot_Dog_Buns, Sweet_Relish)",(Hot_Dogs),9.031
35,"(Toilet_Paper, Potatoes)",(White_Bread),7.319
37,"(Toothpaste, Toilet_Paper)",(White_Bread),7.346


In [32]:
# Sort the rules by decreasing lift and keep the 10 best ones
myRegles.sort_values('lift', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,lift
49,"(2pct_Milk, Potato_Chips, White_Bread)",(Toothpaste),9.514
29,"(Hot_Dog_Buns, Sweet_Relish)",(Hot_Dogs),9.031
16,"(Hamburger_Buns, White_Bread)",(98pct_Fat_Free_Hamburger),8.202
47,"(Toothpaste, 2pct_Milk, White_Bread)",(Potato_Chips),7.726
8,"(Onions, Wheat_Bread)",(2pct_Milk),7.574
48,"(Toothpaste, Potato_Chips, White_Bread)",(2pct_Milk),7.569
17,"(98pct_Fat_Free_Hamburger, Wheat_Bread)",(White_Bread),7.556
13,"(Toothpaste, Wheat_Bread)",(2pct_Milk),7.38
37,"(Toothpaste, Toilet_Paper)",(White_Bread),7.346
46,"(Toothpaste, Potato_Chips, 2pct_Milk)",(White_Bread),7.319


In [33]:
# Keep the rules whose consequent is exactly {2pct_Milk}
myRegles[myRegles['consequents'].eq({'2pct_Milk'})]

Unnamed: 0,antecedents,consequents,lift
2,"(Bananas, White_Bread)",(2pct_Milk),7.261
3,"(Cola, Wheat_Bread)",(2pct_Milk),7.261
7,"(Eggs, Wheat_Bread)",(2pct_Milk),6.97
8,"(Onions, Wheat_Bread)",(2pct_Milk),7.574
9,"(Toothpaste, Potato_Chips)",(2pct_Milk),6.98
10,"(Wheat_Bread, Potatoes)",(2pct_Milk),7.053
13,"(Toothpaste, Wheat_Bread)",(2pct_Milk),7.38
41,"(Eggs, Potato_Chips, White_Bread)",(2pct_Milk),7.143
44,"(Eggs, Toothpaste, White_Bread)",(2pct_Milk),7.261
48,"(Toothpaste, Potato_Chips, White_Bread)",(2pct_Milk),7.569


In [34]:
# Keep the rules whose antecedent contains 'Aspirin'
myRegles[myRegles['antecedents'].ge({'Aspirin'})]

Unnamed: 0,antecedents,consequents,lift
0,"(2pct_Milk, Aspirin)",(White_Bread),6.609
18,"(Eggs, Aspirin)",(White_Bread),6.458
19,"(Aspirin, Potato_Chips)",(White_Bread),6.339
20,"(Aspirin, Potatoes)",(White_Bread),6.962
21,"(Toothpaste, Aspirin)",(White_Bread),6.996
