In [1]:
# Load the transactions from our tab-separated text file
import pandas as pd
dataset = pd.read_table("market_basket.txt", sep="\t")

In [2]:
# Display the first ten rows
dataset.head(10)

Unnamed: 0,ID,Product
0,1,Peaches
1,2,Vegetable_Oil
2,2,Frozen_Corn
3,3,Plums
4,4,Pancake_Mix
5,5,Cheese
6,6,Cauliflower
7,7,2pct_Milk
8,8,98pct_Fat_Free_Hamburger
9,8,Potato_Chips


In [3]:
# Dimensions of the raw table as (rows, columns)
dataset.shape

(12935, 2)

In [4]:
# Pivot the long (ID, Product) table into a transaction x product matrix:
# one row per transaction ID, one column per product.
# pd.crosstab returns occurrence *counts*, so a product listed twice under the
# same ID would produce a 2; clip at 1 so that every cell is a strict 0/1
# presence flag, which is the input format mlxtend's apriori expects.
df = pd.crosstab(dataset["ID"], dataset["Product"]).clip(upper=1)
# Preview only the top-left corner (up to 30 rows x 7 columns) of the wide matrix
df.iloc[:30, :7]

Product,100_Watt_Lightbulb,2pct_Milk,40_Watt_Lightbulb,60_Watt_Lightbulb,75_Watt_Lightbulb,98pct_Fat_Free_Hamburger,AA_Cell_Batteries
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0
8,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0


In [5]:
# Dimensions of the cross-tabulation: (distinct IDs, distinct products)
df.shape

(1360, 303)

In [6]:
# Show the column labels
print(df.columns)

Index(['100_Watt_Lightbulb', '2pct_Milk', '40_Watt_Lightbulb',
       '60_Watt_Lightbulb', '75_Watt_Lightbulb', '98pct_Fat_Free_Hamburger',
       'AA_Cell_Batteries', 'Apple_Cinnamon_Waffles', 'Apple_Drink',
       'Apple_Fruit_Roll',
       ...
       'White_Bread', 'White_Wine', 'White_Zinfandel_Wine', 'Whole_Corn',
       'Whole_Green_Beans', 'Whole_Milk', 'Window_Cleaner', 'Wood_Polish',
       'flav_Fruit_Bars', 'flav_Ice'],
      dtype='object', name='Product', length=303)


In [8]:
# Importation de la fonction apriori
from mlxtend.frequent_patterns import apriori

In [9]:
# Mine the frequent itemsets from the transaction x product matrix
freq_itemsets = apriori(
    df,
    use_colnames=True,   # label items with the column names instead of column positions
    min_support=0.025,   # keep itemsets whose support is at least 0.025
    max_len=4,           # do not generate itemsets of more than 4 items
)

In [10]:
# Check what kind of object apriori returned
type(freq_itemsets)

pandas.core.frame.DataFrame

In [13]:
# List the columns of the result table
freq_itemsets.columns

Index(['support', 'itemsets'], dtype='object')

In [14]:
# Dimensions of the result table as (rows, columns)
freq_itemsets.shape

(603, 2)

In [15]:
# Display the first ten rows of the result table
freq_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.030147,(100_Watt_Lightbulb)
1,0.109559,(2pct_Milk)
2,0.0375,(60_Watt_Lightbulb)
3,0.031618,(75_Watt_Lightbulb)
4,0.093382,(98pct_Fat_Free_Hamburger)
5,0.031618,(AA_Cell_Batteries)
6,0.025735,(Apple_Cinnamon_Waffles)
7,0.026471,(Apple_Drink)
8,0.031618,(Apple_Fruit_Roll)
9,0.032353,(Apple_Jam)


In [17]:
# Check the type of the 'itemsets' column
type(freq_itemsets.itemsets)

pandas.core.series.Series

In [18]:
# Indexed access to the first element
print(freq_itemsets.itemsets[0])

frozenset({'100_Watt_Lightbulb'})


Un itemset est représenté par un objet frozenset de Python, un type ensemble (set) qui n’est pas
modifiable (on dit immuable). Plusieurs solutions s’offrent à nous pour rechercher les itemsets
répondant à des conditions de présence d’items.

## Solution 1 : Utilisation de la méthode apply() de pandas.Series

In [20]:
# Inclusion test: does an itemset contain all the requested items?
def is_inclus(x, items):
    """Return True when every element of ``items`` is present in ``x``.

    Parameters
    ----------
    x : iterable
        The candidate itemset (e.g. a frozenset taken from the ``itemsets``
        column).
    items : str or iterable of hashable
        The item(s) to look for. A bare string is treated as ONE item name,
        not as a sequence of characters. Any iterable (set, frozenset, list,
        tuple, ...) is accepted.

    Returns
    -------
    bool
        True if ``items`` is a subset of ``x`` (always True for empty
        ``items``), False otherwise.
    """
    # Guard against the classic mistake of passing 'Aspirin' instead of
    # {'Aspirin'}: iterating the string would test its individual letters.
    if isinstance(items, str):
        items = (items,)
    # Normalise to a frozenset so lists/tuples work as well as sets.
    return frozenset(items).issubset(x)

In [21]:
# Find the positions of the itemsets that satisfy a condition (here: contain 'Aspirin').
# np.where on the boolean result yields a 1-tuple holding the array of matching
# row positions.
# NOTE: the name `id` shadows the Python builtin; it is kept because the
# following cell reads it.
import numpy as np
id = np.where(
    freq_itemsets["itemsets"].apply(is_inclus, items={'Aspirin'})
)
print(id)

(array([ 13, 208, 249, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
       282, 283, 284, 285, 286, 509, 510, 511, 552, 553, 554, 555, 556],
      dtype=int64),)


In [22]:
# Display the matching itemsets.
# `id` is the 1-tuple returned by np.where, i.e. *positional* indices, so the
# rows are selected with .iloc on the array it holds. Label-based .loc only
# returned the right rows because the default index happens to be 0..n-1.
print(freq_itemsets.iloc[id[0]])

      support                              itemsets
13   0.066912                             (Aspirin)
208  0.034559                  (Aspirin, 2pct_Milk)
249  0.027941   (Aspirin, 98pct_Fat_Free_Hamburger)
272  0.027206                       (Aspirin, Cola)
273  0.025735              (Aspirin, Domestic_Beer)
274  0.038235                       (Eggs, Aspirin)
275  0.027206                   (Hot_Dogs, Aspirin)
276  0.027206                     (Onions, Aspirin)
277  0.027206   (Pepperoni_Pizza_-_Frozen, Aspirin)
278  0.025000               (Popcorn_Salt, Aspirin)
279  0.036029               (Aspirin, Potato_Chips)
280  0.030147                   (Potatoes, Aspirin)
281  0.030147               (Sweet_Relish, Aspirin)
282  0.028676               (Toilet_Paper, Aspirin)
283  0.025000                   (Tomatoes, Aspirin)
284  0.030882                 (Aspirin, Toothpaste)
285  0.025000                (Aspirin, Wheat_Bread)
286  0.041912                (White_Bread, Aspirin)
509  0.02573

Pour les impatients, nous pouvons définir à la volée la fonction de comparaison lors de l’appel de
la méthode apply() de Series via le mécanisme des fonctions anonymes lambda de Python.

In [24]:
# Shortcut: define the inclusion test inline as an anonymous function
np.where(
    freq_itemsets["itemsets"].apply(
        lambda itemset, ensemble: ensemble.issubset(itemset),
        ensemble={'Aspirin'},
    )
)

(array([ 13, 208, 249, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
        282, 283, 284, 285, 286, 509, 510, 511, 552, 553, 554, 555, 556],
       dtype=int64),)

## Solution 2 : Utilisation des opérateurs de comparaison de pandas.Series

On va utiliser les méthodes de comparaison vectorisées de Series (« binary operator functions »), par exemple ge() (≥, test de sur-ensemble) et eq() (égalité) :

In [25]:
# Itemsets containing Aspirin, using the native comparison methods of Series:
# .ge compares each itemset with {'Aspirin'} using >=, i.e. a superset test
print(freq_itemsets.loc[freq_itemsets['itemsets'].ge({'Aspirin'})])

      support                              itemsets
13   0.066912                             (Aspirin)
208  0.034559                  (Aspirin, 2pct_Milk)
249  0.027941   (Aspirin, 98pct_Fat_Free_Hamburger)
272  0.027206                       (Aspirin, Cola)
273  0.025735              (Aspirin, Domestic_Beer)
274  0.038235                       (Eggs, Aspirin)
275  0.027206                   (Hot_Dogs, Aspirin)
276  0.027206                     (Onions, Aspirin)
277  0.027206   (Pepperoni_Pizza_-_Frozen, Aspirin)
278  0.025000               (Popcorn_Salt, Aspirin)
279  0.036029               (Aspirin, Potato_Chips)
280  0.030147                   (Potatoes, Aspirin)
281  0.030147               (Sweet_Relish, Aspirin)
282  0.028676               (Toilet_Paper, Aspirin)
283  0.025000                   (Tomatoes, Aspirin)
284  0.030882                 (Aspirin, Toothpaste)
285  0.025000                (Aspirin, Wheat_Bread)
286  0.041912                (White_Bread, Aspirin)
509  0.02573

In [26]:
# The itemset that is exactly {'Aspirin'}: .eq tests set equality, not membership
print(freq_itemsets.loc[freq_itemsets['itemsets'].eq({'Aspirin'})])

     support   itemsets
13  0.066912  (Aspirin)


In [27]:
# Itemsets containing both Aspirin and Eggs
print(freq_itemsets.loc[freq_itemsets['itemsets'].ge({'Aspirin','Eggs'})])

      support                       itemsets
274  0.038235                (Eggs, Aspirin)
509  0.025735     (Eggs, Aspirin, 2pct_Milk)
552  0.025000  (Eggs, Aspirin, Potato_Chips)
553  0.029412   (Eggs, White_Bread, Aspirin)


In [28]:
# Itemsets containing both Eggs and Aspirin, with the items written in the
# other order: sets are unordered, so the order in the literal does not matter
print(freq_itemsets.loc[freq_itemsets['itemsets'].ge({'Eggs','Aspirin'})])

      support                       itemsets
274  0.038235                (Eggs, Aspirin)
509  0.025735     (Eggs, Aspirin, 2pct_Milk)
552  0.025000  (Eggs, Aspirin, Potato_Chips)
553  0.029412   (Eggs, White_Bread, Aspirin)
