In [1]:
# pip install efficient_apriori
# pip install fptools

In [2]:
import pandas as pd
import numpy as np
from efficient_apriori import apriori as EA
from mlxtend.frequent_patterns import apriori, association_rules

# 加载数据：

In [3]:
df = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)

In [4]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,


# 使用efficient_apriori进行关联分析：
---
## 将数据存入transaction

In [5]:
# # 用于存商品数据
# transactions = []

# for i in range(0, df.shape[0]):
#     # 每一笔订单
#     temp = set()
#     for j in range(0, df.shape[1]):
#         item = str(df.values[i, j])
#         if item != 'nan':
#             temp.add(item)
#     transactions.append(temp)

### 订单去重后进行分析

In [6]:
# 测试第二行删掉空值后转为列表的效果
df.loc[1].dropna().to_list()

['burgers', 'meatballs', 'eggs']

In [7]:
# 等价于上一个for循环的代码
# 用于存商品数据
transactions = []

# 遍历每一行
for i in range(0, df.shape[0]):
    # 每一笔订单，外层增加了set()对订单中的商品进行去重
    temp = set(df.loc[i].dropna().to_list())
    transactions.append(temp)
    
transactions

[{'almonds',
  'antioxydant juice',
  'avocado',
  'cottage cheese',
  'energy drink',
  'frozen smoothie',
  'green grapes',
  'green tea',
  'honey',
  'low fat yogurt',
  'mineral water',
  'olive oil',
  'salad',
  'salmon',
  'shrimp',
  'spinach',
  'tomato juice',
  'vegetables mix',
  'whole weat flour',
  'yams'},
 {'burgers', 'eggs', 'meatballs'},
 {'chutney'},
 {'avocado', 'turkey'},
 {'energy bar', 'green tea', 'milk', 'mineral water', 'whole wheat rice'},
 {'low fat yogurt'},
 {'french fries', 'whole wheat pasta'},
 {'light cream', 'shallot', 'soup'},
 {'frozen vegetables', 'green tea', 'spaghetti'},
 {'french fries'},
 {'eggs', 'pet food'},
 {'cookies'},
 {'burgers', 'cooking oil', 'eggs', 'mineral water', 'turkey'},
 {'champagne', 'cookies', 'spaghetti'},
 {'mineral water', 'salmon'},
 {'mineral water'},
 {'chicken',
  'chocolate',
  'cooking oil',
  'honey',
  'low fat yogurt',
  'oil',
  'shrimp'},
 {'eggs', 'turkey'},
 {'black tea',
  'chicken',
  'eggs',
  'extra dar

In [8]:
len(transactions)

7501

#### 计算频繁项集和关联规则

```
apriori(
    transactions: Union[List[tuple], Callable],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
    output_transaction_ids: bool = False,
)
```

In [9]:
%%time
itemsets, rules = EA(transactions, min_support=0.04, min_confidence=0.2)

CPU times: user 465 ms, sys: 0 ns, total: 465 ms
Wall time: 472 ms


In [10]:
print("频繁项集:\n", itemsets)
print("关联规则:\n", rules)

频繁项集:
 {1: {('green tea',): 991, ('salmon',): 319, ('shrimp',): 536, ('mineral water',): 1788, ('olive oil',): 494, ('low fat yogurt',): 574, ('honey',): 356, ('frozen smoothie',): 475, ('burgers',): 654, ('eggs',): 1348, ('turkey',): 469, ('whole wheat rice',): 439, ('milk',): 972, ('french fries',): 1282, ('soup',): 379, ('spaghetti',): 1306, ('frozen vegetables',): 715, ('cookies',): 603, ('cooking oil',): 383, ('champagne',): 351, ('chocolate',): 1229, ('chicken',): 450, ('tomatoes',): 513, ('pancakes',): 713, ('grated cheese',): 393, ('fresh bread',): 323, ('escalope',): 595, ('ground beef',): 737, ('herb & pepper',): 371, ('cake',): 608}, 2: {('milk', 'mineral water'): 360, ('eggs', 'mineral water'): 382, ('mineral water', 'spaghetti'): 448, ('ground beef', 'mineral water'): 307, ('chocolate', 'mineral water'): 395}}
关联规则:
 [{mineral water} -> {milk}, {milk} -> {mineral water}, {mineral water} -> {eggs}, {eggs} -> {mineral water}, {spaghetti} -> {mineral water}, {mineral water} -

#### 探索Rule对象的属性

In [11]:
type(itemsets), type(rules)

(dict, list)

In [12]:
type(itemsets[1]), type(rules[0])

(dict, efficient_apriori.rules.Rule)

In [13]:
len(rules)

9

In [14]:
# 查看第一个关联规则的支持度、置信度、提升度
rules[0].support, rules[0].confidence, rules[0].lift

(0.04799360085321957, 0.20134228187919462, 1.5537741320739082)

对于规则{Diaper}→{Beer}，{Diaper}叫做前件，{Beer}叫做后件。

In [15]:
# conviction 代表什么？The conviction of a rule X -> Y is the ratio P(not Y) / P(not Y | X).
# count_full 前件和后件同时出现在一个订单中的订单数
# count_lhs 前件出现在订单中的订单数
rules[0].conviction, rules[0].count_full, rules[0].count_lhs

(1.0898502034820343, 360, 1788)

In [16]:
# count_rhs 后件出现在订单中的订单数
# lhs 前件的值, rhs后件的值
rules[0].count_rhs, rules[0].lhs, rules[0].rhs

(972, ('mineral water',), ('milk',))

In [17]:
# 订单数量
rules[0].num_transactions

7501

In [18]:
print("rules[0]的支持度:", 360 / 7501) # rules[0].count_full / rules[0].num_transactions
print("rules[0]的置信度:", 360 / 1788) # rules[0].count_full / rules[0].count_lhs

rules[0]的支持度: 0.04799360085321957
rules[0]的置信度: 0.20134228187919462


In [19]:
# rules[0]的置信度 / 后件的支持度
print("rules[0]的支持度:", (360 / 1788) / (972 / 7501))

rules[0]的支持度: 1.5537741320739085


In [20]:
# rules[0].conviction的值 P(not Y) / P(not Y | X).
((7501 - 972) / 7501) / ((1788 - 360) / 1788)

1.0898502048466367

In [21]:
rules[0].rpf  # RPF（Rule Power Factor）是置信度乘以支持度。

0.00966314111138649

### 订单不去重进行分析

In [22]:
# 不使用set()去重
# 用于存商品数据
transactions = []
duplicated_or_no = []

for i in range(0, df.shape[0]):
    # 每一笔订单
    temp = df.loc[i].dropna().to_list()
    transactions.append(temp)
    
transactions

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers', 'meatballs', 'eggs'],
 ['chutney'],
 ['turkey', 'avocado'],
 ['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea'],
 ['low fat yogurt'],
 ['whole wheat pasta', 'french fries'],
 ['soup', 'light cream', 'shallot'],
 ['frozen vegetables', 'spaghetti', 'green tea'],
 ['french fries'],
 ['eggs', 'pet food'],
 ['cookies'],
 ['turkey', 'burgers', 'mineral water', 'eggs', 'cooking oil'],
 ['spaghetti', 'champagne', 'cookies'],
 ['mineral water', 'salmon'],
 ['mineral water'],
 ['shrimp',
  'chocolate',
  'chicken',
  'honey',
  'oil',
  'cooking oil',
  'low fat yogurt'],
 ['turkey', 'eggs'],
 ['turkey',
  'fresh tuna',
  'tomatoes',
  'spagh

#### 计算频繁项集和关联规则

In [23]:
%%time
itemsets, rules = EA(transactions, min_support=0.04, min_confidence=0.2)

CPU times: user 446 ms, sys: 0 ns, total: 446 ms
Wall time: 457 ms


In [24]:
print("频繁项集:\n", itemsets)
print("关联规则:\n", rules)

频繁项集:
 {1: {('green tea',): 991, ('shrimp',): 536, ('low fat yogurt',): 574, ('frozen smoothie',): 475, ('salmon',): 319, ('mineral water',): 1788, ('olive oil',): 494, ('honey',): 356, ('burgers',): 654, ('eggs',): 1348, ('turkey',): 469, ('whole wheat rice',): 439, ('milk',): 972, ('french fries',): 1282, ('soup',): 379, ('spaghetti',): 1306, ('frozen vegetables',): 715, ('cookies',): 603, ('cooking oil',): 383, ('champagne',): 351, ('chocolate',): 1229, ('chicken',): 450, ('tomatoes',): 513, ('pancakes',): 713, ('grated cheese',): 393, ('fresh bread',): 323, ('escalope',): 595, ('ground beef',): 737, ('herb & pepper',): 371, ('cake',): 608}, 2: {('milk', 'mineral water'): 360, ('eggs', 'mineral water'): 382, ('mineral water', 'spaghetti'): 448, ('ground beef', 'mineral water'): 307, ('chocolate', 'mineral water'): 395}}
关联规则:
 [{mineral water} -> {milk}, {milk} -> {mineral water}, {mineral water} -> {eggs}, {eggs} -> {mineral water}, {spaghetti} -> {mineral water}, {mineral water} -

In [25]:
len(rules)  # 关联规则的数量有9个

9

看起来去重前后结果没有变化，在2.2中使用短数据集再验证下

In [26]:
df.loc[4494].dropna() # 存在重复商品

0            ham
1           eggs
2          honey
3           gums
4    light cream
5            ham
Name: 4494, dtype: object

In [27]:
df.loc[4394].dropna() # 存在重复商品

0             burgers
1                 ham
2                eggs
3    whole wheat rice
4                 ham
5        french fries
6             cookies
7           green tea
Name: 4394, dtype: object

## 测试订单中重复商品是否影响efficient_apriori关联分析结果：

In [28]:
transactions_ = [('eggs', 'bacon', 'soup', 'soup'),
                ('eggs', 'bacon', 'apple', 'bacon'),
                ('soup', 'bacon', 'banana')]
itemsets, rules = EA(transactions_, min_support=0.5, min_confidence=1) 
print("频繁项集:\n", itemsets)
print("关联规则:\n", rules)

频繁项集:
 {1: {('bacon',): 3, ('soup',): 2, ('eggs',): 2}, 2: {('bacon', 'eggs'): 2, ('bacon', 'soup'): 2}}
关联规则:
 [{eggs} -> {bacon}, {soup} -> {bacon}]


**小结：通过这个例子可以看出，当同一个元组(也就是同一个订单)中存在重复商品时，不影响频繁项集的计数结果。因为它统计的是某商品在不同订单中出现的次数，而不是同一个订单中的出现次数。**

# 使用mlxtend进行关联分析
---
尝试两种不同的df处理方法

## df处理方法1:列字段合并再进行独热编码

In [29]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


### 创建空DataFrame
---
用于存储拼接后的字符串

In [30]:
df_new = pd.DataFrame(columns=['items'])
df_new

Unnamed: 0,items


### 将df每行中的商品拼接成一个字符串
---
并存入新的DataFrame中

In [31]:
%%time
for i in range(df.shape[0]):
    df_new.loc[i] = df.loc[i].str.cat(sep='/')

CPU times: user 27.8 s, sys: 50.9 ms, total: 27.8 s
Wall time: 28 s


In [32]:
df_new 

Unnamed: 0,items
0,shrimp/almonds/avocado/vegetables mix/green gr...
1,burgers/meatballs/eggs
2,chutney
3,turkey/avocado
4,mineral water/milk/energy bar/whole wheat rice...
...,...
7496,butter/light mayo/fresh bread
7497,burgers/frozen vegetables/eggs/french fries/ma...
7498,chicken
7499,escalope/green tea


In [33]:
type(df_new.loc[0, 'items'])

str

### 对新DataFrame进行one-hot编码

In [34]:
one_hot_df = df_new['items'].str.get_dummies(sep="/")
one_hot_df

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
one_hot_df.max(axis=0).sum()  # 所有列最大值的和=120，说明每一列最大值均为1，符合one-hot编码的特征

120

### 获取频繁项集通过one-hot编码的DataFrame对象

```
apriori(
    df,
    min_support=0.5,
    use_colnames=False,
    max_len=None,
    verbose=0,
    low_memory=False,
)
```

In [36]:
%%time
itemsets = apriori(one_hot_df, min_support=0.04, use_colnames=True)

CPU times: user 51.1 ms, sys: 56.9 ms, total: 108 ms
Wall time: 107 ms


In [37]:
itemsets.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
20,0.238368,(mineral water)
7,0.179709,(eggs)
26,0.17411,(spaghetti)
9,0.170911,(french fries)
4,0.163845,(chocolate)
14,0.132116,(green tea)
19,0.129583,(milk)
15,0.098254,(ground beef)
12,0.095321,(frozen vegetables)
22,0.095054,(pancakes)


### 从频繁项集中找出符合条件的关联条件

[利用mlxtend进行数据关联分析，查看返回值各列含义](https://blog.csdn.net/qq_36523839/article/details/83960195)
```
association_rules(
    df,
    metric='confidence',
    min_threshold=0.8,
    support_only=False,
)```

In [38]:
%%time
rules = association_rules(itemsets, metric='lift', min_threshold=1)

CPU times: user 4.95 ms, sys: 1.98 ms, total: 6.93 ms
Wall time: 6.7 ms


In [39]:
rules.sort_values(by=['lift'], ascending=False) # 按规则提升度降序排序

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(mineral water),(ground beef),0.238368,0.098254,0.040928,0.1717,1.747522,0.017507,1.088672
5,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401
6,(milk),(mineral water),0.129583,0.238368,0.047994,0.37037,1.553774,0.017105,1.20965
7,(mineral water),(milk),0.238368,0.129583,0.047994,0.201342,1.553774,0.017105,1.08985
8,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
9,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
0,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
1,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
2,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815
3,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158


In [40]:
'mineral water' in one_hot_df.columns

True

## df处理方法2：直接整行读入CSV
---
这种方法感觉更适合mlxtend包的apriori算法

### 数据加载

In [41]:
df = pd.read_csv('Market_Basket_Optimisation.csv', sep='\t', header=None)

In [42]:
df

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


### df独热编码

In [43]:
%%time
df_one_hot = df[0].str.get_dummies(sep=',')

CPU times: user 1.22 s, sys: 57.9 ms, total: 1.28 s
Wall time: 1.28 s


In [44]:
df_one_hot

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df_one_hot.max().sum()  # 校验是否最大值都是1

120

### 计算频繁项集：

In [46]:
%%time
# 按最小支持度0.04统计频繁项集
frequence_items = apriori(df_one_hot, min_support=0.04, use_colnames=True)

CPU times: user 50.2 ms, sys: 53.9 ms, total: 104 ms
Wall time: 103 ms


In [47]:
%%time
# 按支持度降序查看频繁项集df
frequence_items.sort_values(by=['support'], ascending=False)

CPU times: user 957 µs, sys: 0 ns, total: 957 µs
Wall time: 991 µs


Unnamed: 0,support,itemsets
20,0.238368,(mineral water)
7,0.179709,(eggs)
26,0.17411,(spaghetti)
9,0.170911,(french fries)
4,0.163845,(chocolate)
14,0.132116,(green tea)
19,0.129583,(milk)
15,0.098254,(ground beef)
12,0.095321,(frozen vegetables)
22,0.095054,(pancakes)


### 根据频繁项集求关联规则

In [48]:
%%time
rules = association_rules(frequence_items, metric='lift', min_threshold=1)
rules.sort_values(by='lift', ascending=False)

CPU times: user 7.86 ms, sys: 0 ns, total: 7.86 ms
Wall time: 7.76 ms


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(mineral water),(ground beef),0.238368,0.098254,0.040928,0.1717,1.747522,0.017507,1.088672
5,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401
6,(milk),(mineral water),0.129583,0.238368,0.047994,0.37037,1.553774,0.017105,1.20965
7,(mineral water),(milk),0.238368,0.129583,0.047994,0.201342,1.553774,0.017105,1.08985
8,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
9,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
0,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
1,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
2,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815
3,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158


---
# efficient_apriori和mlxtend的apriori算法比较：
---
除了efficient_apriori效率高，返回参数少；mlxtend效率低，返回参数多以外：
1. efficient_apriori的apriori算法，频繁项集和关联规则是一起输出；且只能通过最小置信度来获取关联规则；
2. mlxtend的apriori算法，提供2个方法，一个输出计算频繁项集，一个输出计算关联规则，支持最小置信度、最小支持度等8个指标来获取关联规则；
3. mlxtend的apriori算法结果更加清晰直观；
4. efficient_apriori的apriori算法，得到的关联规则中包含的支持度等信息是隐藏在Rule对象中的，需要单独打印Rule对象才能看到，不如mlxtend的结果直观。

# 尝试FPGrowth算法
---
纯粹尝试

## fptools包

In [49]:
import fptools as fp

In [50]:
transactions

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers', 'meatballs', 'eggs'],
 ['chutney'],
 ['turkey', 'avocado'],
 ['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea'],
 ['low fat yogurt'],
 ['whole wheat pasta', 'french fries'],
 ['soup', 'light cream', 'shallot'],
 ['frozen vegetables', 'spaghetti', 'green tea'],
 ['french fries'],
 ['eggs', 'pet food'],
 ['cookies'],
 ['turkey', 'burgers', 'mineral water', 'eggs', 'cooking oil'],
 ['spaghetti', 'champagne', 'cookies'],
 ['mineral water', 'salmon'],
 ['mineral water'],
 ['shrimp',
  'chocolate',
  'chicken',
  'honey',
  'oil',
  'cooking oil',
  'low fat yogurt'],
 ['turkey', 'eggs'],
 ['turkey',
  'fresh tuna',
  'tomatoes',
  'spagh

In [51]:
# fp.fpgrowth()  # 在给定的树上执行fpgrowth算法以产生所有频繁项集。

In [52]:
# fp.frequent_itemsets(itemsets, minsup)  # 启动FP增长算法

In [53]:
%%time
# 构建FPGrowth树
tree = fp.build_tree(transactions, minsup=300)

CPU times: user 44.7 ms, sys: 1.01 ms, total: 45.7 ms
Wall time: 45 ms


In [54]:
tree

(<fptools.FPTree at 0x7f28ef87d6d0>,
 {'salmon': 0,
  'fresh bread': 1,
  'champagne': 2,
  'honey': 3,
  'herb & pepper': 4,
  'soup': 5,
  'cooking oil': 6,
  'grated cheese': 7,
  'whole wheat rice': 8,
  'chicken': 9,
  'turkey': 10,
  'frozen smoothie': 11,
  'olive oil': 12,
  'tomatoes': 13,
  'shrimp': 14,
  'low fat yogurt': 15,
  'escalope': 16,
  'cookies': 17,
  'cake': 18,
  'burgers': 19,
  'pancakes': 20,
  'frozen vegetables': 21,
  'ground beef': 22,
  'milk': 23,
  'green tea': 24,
  'chocolate': 25,
  'french fries': 26,
  'spaghetti': 27,
  'eggs': 28,
  'mineral water': 29})

In [55]:
len(tree)

2

In [56]:
print(type(tree[0]),  type(tree[1]))

<class 'fptools.FPTree'> <class 'dict'>


In [57]:
tree[0].nodes

defaultdict(list,
            {'mineral water': [<fptools.FPNode at 0x7f28f398d110>],
             'green tea': [<fptools.FPNode at 0x7f28f398d090>,
              <fptools.FPNode at 0x7f28f39af810>,
              <fptools.FPNode at 0x7f28ef8933d0>,
              <fptools.FPNode at 0x7f28ef806810>,
              <fptools.FPNode at 0x7f28ef806990>,
              <fptools.FPNode at 0x7f28ef806290>,
              <fptools.FPNode at 0x7f28ef76e3d0>,
              <fptools.FPNode at 0x7f28ef76e210>,
              <fptools.FPNode at 0x7f28edba53d0>,
              <fptools.FPNode at 0x7f28ef808390>,
              <fptools.FPNode at 0x7f28ef808ad0>,
              <fptools.FPNode at 0x7f28ef811890>,
              <fptools.FPNode at 0x7f28ef811d90>,
              <fptools.FPNode at 0x7f28ef817650>,
              <fptools.FPNode at 0x7f28ef821090>,
              <fptools.FPNode at 0x7f28ef801890>,
              <fptools.FPNode at 0x7f28ef8ad210>,
              <fptools.FPNode at 0x7f28ef8adb10>,
 

In [58]:
len(tree[0].rank)

30

In [59]:
tree[0].rank  # 项头表

{'salmon': 0,
 'fresh bread': 1,
 'champagne': 2,
 'honey': 3,
 'herb & pepper': 4,
 'soup': 5,
 'cooking oil': 6,
 'grated cheese': 7,
 'whole wheat rice': 8,
 'chicken': 9,
 'turkey': 10,
 'frozen smoothie': 11,
 'olive oil': 12,
 'tomatoes': 13,
 'shrimp': 14,
 'low fat yogurt': 15,
 'escalope': 16,
 'cookies': 17,
 'cake': 18,
 'burgers': 19,
 'pancakes': 20,
 'frozen vegetables': 21,
 'ground beef': 22,
 'milk': 23,
 'green tea': 24,
 'chocolate': 25,
 'french fries': 26,
 'spaghetti': 27,
 'eggs': 28,
 'mineral water': 29}

In [60]:
type(tree[0].nodes['mineral water'][0])

fptools.FPNode

In [61]:
mineral_water_node = tree[0].nodes['mineral water'][0]
mineral_water_node

<fptools.FPNode at 0x7f28f398d110>

In [62]:
mineral_water_node.children # 子孩子节点

defaultdict(fptools.FPNode,
            {'green tea': <fptools.FPNode at 0x7f28f398d090>,
             'eggs': <fptools.FPNode at 0x7f28ef893a10>,
             'salmon': <fptools.FPNode at 0x7f28ef893c50>,
             'spaghetti': <fptools.FPNode at 0x7f28ef893190>,
             'ground beef': <fptools.FPNode at 0x7f28ef8067d0>,
             'cake': <fptools.FPNode at 0x7f28ef806750>,
             'chicken': <fptools.FPNode at 0x7f28ef806590>,
             'chocolate': <fptools.FPNode at 0x7f28ef8062d0>,
             'french fries': <fptools.FPNode at 0x7f28ef76e590>,
             'olive oil': <fptools.FPNode at 0x7f28ef76e610>,
             'frozen vegetables': <fptools.FPNode at 0x7f28ef808cd0>,
             'turkey': <fptools.FPNode at 0x7f28ef811290>,
             'shrimp': <fptools.FPNode at 0x7f28ef806510>,
             'fresh bread': <fptools.FPNode at 0x7f28ef811b50>,
             'frozen smoothie': <fptools.FPNode at 0x7f28ef817410>,
             'honey': <fptools.FPNode at 0

In [63]:
mineral_water_node.count

1788

In [64]:
mineral_water_node.item

'mineral water'

In [65]:
green_tea_node = mineral_water_node.children['green tea']
green_tea_node

<fptools.FPNode at 0x7f28f398d090>

In [66]:
green_tea_node.count

78

In [67]:
green_tea_node.parent.item

'mineral water'

In [68]:
%%time
items = [i for i in fp.fpgrowth(tree[0], 300)]

CPU times: user 35.9 ms, sys: 1.96 ms, total: 37.8 ms
Wall time: 39.5 ms


In [69]:
print(len(items))
items

35


[['mineral water'],
 ['green tea'],
 ['low fat yogurt'],
 ['shrimp'],
 ['olive oil'],
 ['frozen smoothie'],
 ['honey'],
 ['salmon'],
 ['eggs'],
 ['eggs', 'mineral water'],
 ['burgers'],
 ['turkey'],
 ['milk'],
 ['milk', 'mineral water'],
 ['whole wheat rice'],
 ['french fries'],
 ['soup'],
 ['spaghetti'],
 ['spaghetti', 'mineral water'],
 ['frozen vegetables'],
 ['cookies'],
 ['cooking oil'],
 ['champagne'],
 ['chocolate'],
 ['chocolate', 'mineral water'],
 ['chicken'],
 ['tomatoes'],
 ['pancakes'],
 ['grated cheese'],
 ['fresh bread'],
 ['ground beef'],
 ['ground beef', 'mineral water'],
 ['escalope'],
 ['herb & pepper'],
 ['cake']]

In [70]:
# 最小支持频数统计频繁项集，返回 生成器
generate = fp.frequent_itemsets(transactions, minsup=300) 

In [71]:
generate

<generator object frequent_itemsets at 0x7f28f3b96750>

In [72]:
itemsets = [s for s in generate]
print(len(itemsets))
itemsets

35


[['mineral water'],
 ['green tea'],
 ['low fat yogurt'],
 ['shrimp'],
 ['olive oil'],
 ['frozen smoothie'],
 ['honey'],
 ['salmon'],
 ['eggs'],
 ['eggs', 'mineral water'],
 ['burgers'],
 ['turkey'],
 ['milk'],
 ['milk', 'mineral water'],
 ['whole wheat rice'],
 ['french fries'],
 ['soup'],
 ['spaghetti'],
 ['spaghetti', 'mineral water'],
 ['frozen vegetables'],
 ['cookies'],
 ['cooking oil'],
 ['champagne'],
 ['chocolate'],
 ['chocolate', 'mineral water'],
 ['chicken'],
 ['tomatoes'],
 ['pancakes'],
 ['grated cheese'],
 ['fresh bread'],
 ['ground beef'],
 ['ground beef', 'mineral water'],
 ['escalope'],
 ['herb & pepper'],
 ['cake']]

In [73]:
type(itemsets[0][0])

str

In [74]:
one_hot_df.sum(axis=0).sort_values(ascending=False)[:30]

mineral water        1788
eggs                 1348
spaghetti            1306
french fries         1282
chocolate            1229
green tea             991
milk                  972
ground beef           737
frozen vegetables     715
pancakes              713
burgers               654
cake                  608
cookies               603
escalope              595
low fat yogurt        574
shrimp                536
tomatoes              513
olive oil             494
frozen smoothie       475
turkey                469
chicken               450
whole wheat rice      439
grated cheese         393
cooking oil           383
soup                  379
herb & pepper         371
honey                 356
champagne             351
fresh bread           323
salmon                319
dtype: int64

**小结：FPGrowth的算法内，只找到支持按照最小支持度来求频繁项集的方法。计算效率确实快。它的最小支持度是最小出现次数，和前面两种算法的最小出现概率不同。另外就是它得到的频繁项集中不会显示数据。**