In [9]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

<h1 style="color:blue;"> Toy Example </h1>

In [10]:
# Five toy shopping baskets; each inner list is one transaction.
dataset = [
    ['Lait', 'Oignon', 'Noix', 'Haricots Rouges', 'Oeufs', 'Yaourt'],
    ['Aneth', 'Oignon', 'Noix', 'Haricots Rouges', 'Oeufs', 'Yaourt'],
    ['Lait', 'Pomme', 'Haricots Rouges', 'Oeufs'],
    ['Lait', 'Maïs', 'Haricots Rouges', 'Yaourt'],
    ['Maïs', 'Oignon', 'Haricots Rouges', 'Glace', 'Oeufs'],
]

# Fit the encoder on the baskets, then encode them as a table with one row
# per basket and one column per distinct item.
transactions = TransactionEncoder()
transactions.fit(dataset)
transactions_array = transactions.transform(dataset)

df_train = pd.DataFrame(data=transactions_array, columns=transactions.columns_)
df_train.head(3)

Unnamed: 0,Aneth,Glace,Haricots Rouges,Lait,Maïs,Noix,Oeufs,Oignon,Pomme,Yaourt
0,False,False,True,True,False,True,True,True,False,True
1,True,False,True,False,False,True,True,True,False,True
2,False,False,True,True,False,False,True,False,True,False


### Recherche de motifs fréquents

In [11]:
# Mine the itemsets of the toy table with a minimum support of 0.5,
# labelling them with item names instead of column indices.
frequent_itemsets_train = apriori(
    df_train,
    min_support=0.5,
    use_colnames=True,
)
frequent_itemsets_train

Unnamed: 0,support,itemsets
0,1.0,(Haricots Rouges)
1,0.6,(Lait)
2,0.8,(Oeufs)
3,0.6,(Oignon)
4,0.6,(Yaourt)
5,0.6,"(Lait, Haricots Rouges)"
6,0.8,"(Oeufs, Haricots Rouges)"
7,0.6,"(Oignon, Haricots Rouges)"
8,0.6,"(Yaourt, Haricots Rouges)"
9,0.6,"(Oeufs, Oignon)"


In [12]:
# Keep only the frequent itemsets that contain the item 'Lait'.
# Membership is tested on each itemset itself rather than on its string
# representation, so an item whose name merely contains "Lait" as a substring
# cannot be matched by mistake.
frequent_itemsets_train[frequent_itemsets_train['itemsets'].apply(lambda items: 'Lait' in items)]

Unnamed: 0,support,itemsets
1,0.6,(Lait)
5,0.6,"(Lait, Haricots Rouges)"


### Recherche des règles d'association

In [13]:
from mlxtend.frequent_patterns import association_rules

In [14]:
# Derive association rules from the toy itemsets, thresholding on confidence at 0.7.
training_rules = association_rules(
    frequent_itemsets_train,
    metric="confidence",
    min_threshold=0.7,
)
# Rank by confidence, then lift (both descending) and show the first seven columns.
training_rules.sort_values(by=['confidence', 'lift'], ascending=[False, False]).iloc[:, :7]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
6,(Oignon),(Oeufs),0.6,0.8,0.6,1.0,1.25
9,"(Oignon, Haricots Rouges)",(Oeufs),0.6,0.8,0.6,1.0,1.25
11,(Oignon),"(Oeufs, Haricots Rouges)",0.6,0.8,0.6,1.0,1.25
0,(Lait),(Haricots Rouges),0.6,1.0,0.6,1.0,1.0
1,(Oeufs),(Haricots Rouges),0.8,1.0,0.8,1.0,1.0
3,(Oignon),(Haricots Rouges),0.6,1.0,0.6,1.0,1.0
4,(Yaourt),(Haricots Rouges),0.6,1.0,0.6,1.0,1.0
7,"(Oeufs, Oignon)",(Haricots Rouges),0.6,1.0,0.6,1.0,1.0
2,(Haricots Rouges),(Oeufs),1.0,0.8,0.8,0.8,1.0
5,(Oeufs),(Oignon),0.8,0.6,0.6,0.75,1.25


<h1 style="color:blue;"> Market Kaggle Data Example </h1>

In [15]:
# Load the order lines (one row per product added to an order).
# The comma separator is read_csv's default, so it is not repeated here.
commandes = pd.read_csv('./order_products__prior.csv')
commandes.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [46]:
# (rows, columns) of the raw order-lines table.
commandes.shape

(32434489, 4)

In [45]:
# Load the product catalogue (id, name, aisle and department of each product).
# The comma separator is read_csv's default, so it is not repeated here.
produits = pd.read_csv('./products.csv')
produits.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [47]:
# (rows, columns) of the product catalogue.
produits.shape

(49688, 4)

## Préparation des données

### Sélection des commandes les plus volumineuses (en nombre de produits)

In [53]:
# Number of orders to keep.
selected_c = 100000

# Count the product lines of every order; the count column is named 'frequence'.
commandes_counts = (
    commandes
    .groupby('order_id')['product_id']
    .count()
    .rename('frequence')
    .reset_index()
)

# Keep the `selected_c` orders with the highest counts, renumbering rows from 0.
commandes_counts = (
    commandes_counts
    .sort_values('frequence', ascending=False)
    .head(selected_c)
    .reset_index(drop=True)
)
commandes_counts.head(10)

Unnamed: 0,order_id,frequence
0,1564244,145
1,790903,137
2,61355,127
3,2970392,121
4,2069920,116
5,3308010,115
6,2753324,114
7,2499774,112
8,2621625,109
9,77151,109


In [64]:
# Ids of the orders retained in `commandes_counts`.
freq_commandes = list(commandes_counts['order_id'])

# Keep only the lines that belong to those orders, renumbering rows from 0.
commandes_selected = (
    commandes
    .loc[commandes['order_id'].isin(freq_commandes)]
    .reset_index(drop=True)
)
commandes_selected.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,106,4210,1,0
1,106,11210,2,0
2,106,2839,3,1


In [65]:
# (rows, columns) left after restricting to the retained orders.
commandes_selected.shape

(3426742, 4)

### Sélection des produits les plus achetés

In [66]:
# Number of products to keep.
selected_p = 100

# Count, for every product, the lines of the retained orders in which it appears.
produits_counts = (
    commandes_selected
    .groupby('product_id')['order_id']
    .count()
    .rename('frequence')
    .reset_index()
)

# Keep the `selected_p` products with the highest counts, renumbering rows from 0.
produits_counts = (
    produits_counts
    .sort_values('frequence', ascending=False)
    .head(selected_p)
    .reset_index(drop=True)
)
produits_counts.head(10)

Unnamed: 0,product_id,frequence
0,24852,30893
1,13176,26536
2,21137,23769
3,21903,20101
4,47209,19999
5,26209,15407
6,27966,14952
7,47626,14823
8,47766,13786
9,22935,12218


In [67]:
# Ids of the products retained in `produits_counts`.
freq_produits = list(produits_counts['product_id'])

# Keep only the lines of the retained orders that concern those products.
commandes_finales = (
    commandes_selected
    .loc[commandes_selected['product_id'].isin(freq_produits)]
    .reset_index(drop=True)
)
commandes_finales.shape

(713588, 4)

In [62]:
# Preview the first three lines of the filtered order table.
commandes_finales.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,106,24184,11,1
1,106,21938,16,1
2,106,20114,29,1


### Récupération des noms de produits

In [21]:
# `produits` was already read from './products.csv' in an earlier cell and has
# not been modified since, so it is reused here instead of parsing the same
# file a second time.
produits.head(3)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7


In [70]:
# Attach the catalogue columns (product name, aisle, department) to every order line.
# `how='inner'` spells out the default join type. `validate='many_to_one'` makes
# pandas raise if a product_id appears more than once in `produits`, which would
# otherwise silently duplicate order lines and inflate the basket counts.
commandes_finales_produits = commandes_finales.merge(
    produits,
    on='product_id',
    how='inner',
    validate='many_to_one',
)
commandes_finales_produits.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,106,24184,11,1,Red Peppers,83,4
1,1268,24184,25,1,Red Peppers,83,4
2,1287,24184,16,1,Red Peppers,83,4


### Création du DataFrame pour la recherche de motifs et de règles d'association

In [80]:
# Basket table: one row per order, one column per product name. Each cell counts
# the 'reordered' values recorded for that (order, product) pair, and pairs that
# never occur are filled with 0 before the cast to integers.
panier = (
    commandes_finales_produits
    .pivot_table(index='order_id', columns='product_name', values='reordered', aggfunc='count')
    .fillna(0)
    .astype(int)
)

In [81]:
# Preview the first three baskets (rows are orders, columns are product names).
panier.head(3)

product_name,100% Whole Wheat Bread,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Broccoli Crown,Bunched Cilantro,Cantaloupe,...,Shredded Parmesan,Small Hass Avocado,Sparkling Water Grapefruit,Strawberries,Total 2% with Strawberry Lowfat Greek Strained Yogurt,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Whipped Cream Cheese,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
228,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Recherche de motifs fréquents

In [88]:
# Mine the itemsets of the basket table with a minimum support of 0.05 and list
# them from most to least supported.
# `panier` holds integer counts, whereas apriori expects a presence/absence table:
# casting to bool turns any positive count into True, which avoids the non-boolean
# input path and keeps the call valid even if a count were greater than 1.
frequent_itemsets_train = apriori(
    panier.astype(bool),
    min_support=0.05,
    use_colnames=True,
).sort_values(by='support', ascending=False)
frequent_itemsets_train

Unnamed: 0,support,itemsets
4,0.319047,(Banana)
3,0.274050,(Bag of Organic Bananas)
47,0.245474,(Organic Strawberries)
21,0.207593,(Organic Baby Spinach)
36,0.206539,(Organic Hass Avocado)
...,...,...
5,0.051131,(Blueberries)
17,0.050853,(Orange Bell Pepper)
46,0.050285,(Organic Sticks Low Moisture Part Skim Mozzare...
74,0.050274,"(Organic Raspberries, Organic Hass Avocado)"


### Recherche des règles d'association

In [86]:
# Derive association rules from the market itemsets, thresholding on confidence at 0.2.
training_rules = association_rules(
    frequent_itemsets_train,
    metric="confidence",
    min_threshold=0.2,
)
# Rank by confidence, then lift (both descending) and show the first seven columns.
training_rules.sort_values(by=['confidence', 'lift'], ascending=[False, False]).iloc[:, :7]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
22,(Strawberries),(Banana),0.116959,0.319047,0.055211,0.472053,1.479572
0,(Organic Hass Avocado),(Bag of Organic Bananas),0.206539,0.27405,0.092782,0.449222,1.639198
12,(Organic Raspberries),(Bag of Organic Bananas),0.154417,0.27405,0.067469,0.436932,1.594349
17,(Organic Avocado),(Banana),0.142375,0.319047,0.061428,0.431452,1.352316
16,(Large Lemon),(Banana),0.153084,0.319047,0.062647,0.409229,1.28266
20,(Organic Raspberries),(Organic Strawberries),0.154417,0.245474,0.05708,0.36965,1.50586
3,(Organic Strawberries),(Bag of Organic Bananas),0.245474,0.27405,0.089808,0.365855,1.334992
4,(Organic Hass Avocado),(Organic Strawberries),0.206539,0.245474,0.07411,0.358818,1.461735
9,(Organic Baby Spinach),(Bag of Organic Bananas),0.207593,0.27405,0.071218,0.343068,1.251842
1,(Bag of Organic Bananas),(Organic Hass Avocado),0.27405,0.206539,0.092782,0.338559,1.639198


In [7]:
from prefixspan import PrefixSpan

In [9]:
# Toy sequence database: four sequences over the single-character items 'a'..'d'.
db = [
    list('abacd'),
    list('abcd'),
    list('aba'),
    list('abacd'),
]

# Wrap the database in a PrefixSpan miner.
ps = PrefixSpan(db)


In [65]:
# Print the result of `ps.topk(5, closed=True)`; the recorded output is a list of
# (count, pattern) pairs.
# NOTE(review): the recorded output lists codes such as 'ADE' and 'PH7' that do not
# occur in the toy `db` defined just above — presumably this cell was last run while
# `ps` was bound to another database. Re-run after a kernel restart to refresh it.
print(ps.topk(5,closed=True))

[(91, ['ADE']), (87, ['PH7']), (81, ['ADE', 'ADE']), (75, ['ADE', 'PH7']), (75, ['AMI'])]


In [16]:
# Print the result of `ps.frequent` for a threshold of 3 with the `closed=True` option;
# the recorded output is a list of (count, pattern) pairs.
print(ps.frequent(3, closed=True))

[(4, ['a', 'b']), (3, ['a', 'b', 'a']), (3, ['a', 'b', 'c', 'd']), (3, ['a', 'c', 'd'])]


In [17]:
# Same query as the previous cell, but with the `generator=True` option instead of
# `closed=True`; the recorded output is again a list of (count, pattern) pairs.
print(ps.frequent(3, generator=True))

[(3, ['a', 'a']), (3, ['b', 'a']), (3, ['c']), (3, ['d'])]


In [9]:
import pickle

In [10]:
# Load the pickled `parcours` object from disk.
# The context manager closes the file handle as soon as loading is done; the
# original bare `open(...)` left it open.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('./parcours_final_0_6.pickle', 'rb') as parcours_file:
    parcours = pickle.load(parcours_file)

In [11]:
# `parcours` is already loaded by the previous cell, so the pickle is not read a
# second time here (the duplicate load also left a file handle open).
# Take the ACTE values of the first 100 rows flagged MATERNITE == 1 as a plain
# list to serve as the sequence database; `.iloc` makes the positional slice explicit.
db = parcours.loc[parcours.MATERNITE == 1, 'ACTE'].iloc[:100].tolist()

In [39]:
# Rebuild the PrefixSpan miner on the current `db`.
ps = PrefixSpan(db)

In [29]:
# Print the result of `ps.topk(5)`; the recorded output is a list of (count, pattern) pairs.
print(ps.topk(5))

[(91, ['ADE']), (87, ['PH7']), (81, ['ADE', 'ADE']), (75, ['ADE', 'PH7']), (75, ['AMI'])]


In [61]:
# One independent, initially empty pattern slot per sequence of `db`.
coverage = [[] for _ in db]

def cover(patt, matches):
    """Record `patt` for every matched sequence where it beats the stored pattern.

    Args:
        patt: The candidate pattern (anything supporting ``len``).
        matches: Iterable of 2-item pairs; the first item of each pair is the
            index into the global ``coverage`` list, the second is ignored.

    The global ``coverage`` is updated in place: a slot is overwritten only when
    `patt` is strictly longer than what it currently holds, so on equal length
    the pattern stored first is kept.
    """
    for seq_idx, _ignored in matches:
        if len(patt) > len(coverage[seq_idx]):
            coverage[seq_idx] = patt


# Run the top-5 search with `cover` passed as the callback, so that `cover`
# fills the global `coverage` list (presumably called once per reported pattern).
ps.topk(5, callback=cover)

# Show the pattern retained for every sequence.
print(coverage)


[['PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['PH7'], ['ADE', 'ADE'], ['ADE', 'PH7'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'], ['ADE', 'ADE'],

In [74]:
# Fresh miner on the current `db`: print the top-5 result with default settings,
# followed by one blank line.
ps = PrefixSpan(db)
print(ps.topk(5), end='\n\n')

# Same query once `minlen` is set to 4; the recorded output then only contains
# patterns of four items.
ps.minlen = 4
print(ps.topk(5))


[(91, ['ADE']), (87, ['PH7']), (81, ['ADE', 'ADE']), (75, ['ADE', 'PH7']), (75, ['AMI'])]

[(54, ['AMI', 'MAU', 'AMI', 'MAU']), (52, ['ADE', 'ADE', 'AMI', 'MAU']), (49, ['ADE', 'AMI', 'MAU', 'ADE']), (49, ['AMI', 'MAU', 'ADE', 'ADE']), (48, ['AMI', 'MAU', 'ADE', 'AMI'])]


In [31]:
# Sequence database built from the ACTE values of every row flagged MATERNITE == 1.
db = parcours.loc[parcours.MATERNITE == 1, 'ACTE'].tolist()

# Miner restricted through its `minlen` / `maxlen` attributes (5 and 10).
ps = PrefixSpan(db)
ps.minlen = 5
ps.maxlen = 10

# Alternatives tried earlier and left disabled: ps.frequent(45) and ps.topk(10).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, i.e. the
# query below was stopped by hand before finishing on the full database.
print(ps.frequent(2000, generator=True))


KeyboardInterrupt: 

In [15]:
# Inspect the first sequence of the database (the recorded output is a list of act codes).
db[0]

['PH7',
 'TB',
 'TB',
 'ATU',
 'G',
 'PH2',
 'TB',
 'ATU',
 'G',
 'PMR',
 'HN',
 'TB',
 'PH4']

In [23]:
# Start from an empty pattern slot for every sequence of the current `db`.
coverage = [[] for _ in range(len(db))]

# `cover` is defined once, in an earlier cell, and is deliberately not redefined
# here: it looks up the global `coverage` at call time, so it fills the fresh list
# created above.
# NOTE(review): `ps` still carries the minlen/maxlen set in an earlier cell, which is
# presumably why some sequences keep an empty slot in the recorded output.
ps.topk(10, callback=cover)
print(coverage)


[[], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['ADE', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'ADE', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'ADE'], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], [], ['AMI', 'MAU', 'ADE', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], [], [], ['AMI', 'MAU', 'AMI', 'MAU', 'AMI', 'MAU'], ['AMI',