### 以超市购物为例理解关联规则:

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Toy transaction table: one row per basket (ID) and one 0/1 column per
# product, where 1 means the product was bought in that basket.
data = dict(
    ID=[1, 2, 3, 4, 5, 6],
    Onion=[1, 0, 0, 1, 1, 1],
    Potato=[1, 1, 0, 1, 1, 1],
    Burger=[1, 1, 0, 0, 1, 1],
    Milk=[0, 1, 1, 1, 0, 1],
    Beer=[0, 0, 1, 0, 1, 0],
)

# Each dict key becomes a column of the DataFrame.
df = pd.DataFrame(data)

In [2]:
df

Unnamed: 0,ID,Onion,Potato,Burger,Milk,Beer
0,1,1,1,1,0,0
1,2,0,1,1,1,0
2,3,0,0,0,1,1
3,4,1,1,0,1,0
4,5,1,1,1,0,1
5,6,1,1,1,1,0


#### 设置支持度选择频繁项集
    1、选择最小支持度为50%

In [3]:
# Mine the frequent itemsets with a minimum support of 50%.
# The ID column is left out because it is not a purchase flag. The 0/1 flags
# are cast to bool, the input dtype mlxtend expects (recent releases warn on
# non-bool frames); the resulting supports are unchanged by the cast.
frequent_items = apriori(
    df[['Onion', 'Potato', 'Burger', 'Milk', 'Beer']].astype(bool),
    min_support=0.5,
    use_colnames=True,  # report item names instead of column indices
)

In [5]:
# Order the itemsets by ascending support; the frame is modified in place.
frequent_items.sort_values('support', inplace=True)

In [6]:
frequent_items

Unnamed: 0,support,itemsets
5,0.5,"(Burger, Onion)"
7,0.5,"(Milk, Potato)"
8,0.5,"(Burger, Onion, Potato)"
0,0.666667,(Onion)
2,0.666667,(Burger)
3,0.666667,(Milk)
4,0.666667,"(Onion, Potato)"
6,0.666667,"(Burger, Potato)"
1,0.833333,(Potato)


返回的项集支持度均>=50%

#### 计算规则
    association_rules(df, metric='lift', min_threshold=1, support_only=False)
    可以指定不同的衡量标准与最小阈值
    metric参数：support(支持度)、confidence(置信度)、lift(提升度)、leverage、conviction。根据该参数来过滤掉小于最小阈值的规则
    min_threshold：最小阈值
    support_only：为True时只计算规则的支持度，其余指标列以NaN填充（默认False）

In [17]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose lift reaches the threshold of 1.
rules = association_rules(
    frequent_items,
    metric='lift',
    min_threshold=1,
)

In [20]:
# Show only the rules whose confidence is at least 0.8.
rules.loc[rules['confidence'] >= 0.8]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf
8,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
10,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf


#### 从上述结果可得:
    1、Onion和Potato可以互相搭配
    2、如果Burger,Onion在购物篮，购买Potato的可能性也较高，可以推荐一下

#### 上述示例存在的问题：
    实际数据集不会像data这样标准；因此对数据集需要进行处理

In [44]:
# Raw transactions in a more realistic shape: each basket is a list of product
# names rather than a row of 0/1 flags.
retail_shopping_basket = dict(
    ID=[1, 2, 3, 4, 5, 6],
    Basket=[
        ['Beer', 'Diaper', 'Pretzels', 'Chips', 'Aspirin'],
        ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'BabyFood', 'Milk'],
        ['Soda', 'Chips', 'Milk'],
        ['Soup', 'Beer', 'Diaper', 'Milk', 'IceCream'],
        ['Soda', 'Coffee', 'Milk', 'Bread'],
        ['Beer', 'Chips'],
    ],
)

In [45]:
# Build a two-column frame (ID, Basket) from the dict; each key becomes a column.
retail_df = pd.DataFrame.from_dict(retail_shopping_basket)

In [46]:
retail_df

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, BabyFood, Milk]"
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Coffee, Milk, Bread]"
5,6,"[Beer, Chips]"


In [47]:
# Widen the column display limit to 100 characters so the basket lists are
# not truncated when the frame is shown.
pd.set_option('display.max_colwidth', 100)

In [48]:
retail_df

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, BabyFood, Milk]"
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Coffee, Milk, Bread]"
5,6,"[Beer, Chips]"


#### 如上所示，数据集中都是字符串组成，需要转换成数值编码

In [49]:
# Collapse each basket list into a single comma-separated string.
retail_basket = retail_df['Basket'].str.join(',')

In [50]:
retail_basket

0              Beer,Diaper,Pretzels,Chips,Aspirin
1    Diaper,Beer,Chips,Lotion,Juice,BabyFood,Milk
2                                 Soda,Chips,Milk
3                  Soup,Beer,Diaper,Milk,IceCream
4                          Soda,Coffee,Milk,Bread
5                                      Beer,Chips
Name: Basket, dtype: object

In [51]:
# One-hot encode the baskets: split each string on ',' and produce one 0/1
# indicator column per distinct product.
retail_basket.str.get_dummies(sep=',')

Unnamed: 0,Aspirin,BabyFood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,0,1,1,0,1,0,1,0,1,1,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [34]:
# One-hot encode the baskets, then attach the indicator columns to the basket
# IDs; join aligns the two frames on their shared row index.
basket_dummies = retail_basket.str.get_dummies(sep=',')
retail = retail_df[['ID']].join(basket_dummies)

In [35]:
retail

Unnamed: 0,ID,Aspirin,BabyFood,Bear,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0
1,2,0,1,0,1,0,1,0,1,0,1,1,1,0,0,0
2,3,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,4,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,5,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,6,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0


#### 如上columns为所有的商品信息，其值为是否购买该商品

In [52]:
# Mine the frequent itemsets of the encoded baskets with a minimum support of 50%.
# drop(columns='ID') replaces the positional axis argument drop('ID', 1), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
# The 0/1 indicators are cast to bool, the input dtype mlxtend expects; the
# resulting supports are unchanged by the cast.
frequency_items = apriori(
    retail.drop(columns='ID').astype(bool),
    min_support=0.5,
    use_colnames=True,  # report item names instead of column indices
)

In [53]:
frequency_items

Unnamed: 0,support,itemsets
0,0.5,(Beer)
1,0.666667,(Chips)
2,0.5,(Diaper)
3,0.666667,(Milk)
4,0.5,"(Diaper, Beer)"


In [54]:
# Derive rules from the retail itemsets, filtered on lift.
# min_threshold is written out as 0.8, mlxtend's documented default, so the
# cutoff applied to lift is visible in the call.
association_rules(
    frequency_items,
    metric='lift',
    min_threshold=0.8,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Diaper),(Beer),0.5,0.5,0.5,1.0,2.0,0.25,inf
1,(Beer),(Diaper),0.5,0.5,0.5,1.0,2.0,0.25,inf
