In [None]:
# Prerequisite: if mlxtend is not installed in this kernel's environment, run once:  %pip install mlxtend

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
# mlxtend: a Python library of utilities that are handy for everyday data-science tasks

In [3]:
# Five example shopping baskets; each inner list is one transaction.
# The last basket names 'Cookie' twice.
dataset = [
    ['Milk', 'Cookie', 'Apple', 'Beans', 'Eggs', 'Yogurt'],
    ['Coke', 'Cookie', 'Apple', 'Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Orange', 'Corn', 'Beans', 'Yogurt'],
    ['Corn', 'Cookie', 'Cookie', 'Beans', 'Ice cream', 'Eggs'],
]

# fit():
#   gathers every item name in the dataset, drops duplicates, sorts the
#   names alphabetically and assigns each one a column index
#   (column 0 = first name, column 1 = second name, ...).
#
# transform():
#   for each transaction, marks which of the fitted columns it contains.
#   Illustration with only four items, assuming fit assigned
#   Apple -> 0, Beans -> 1, Cookie -> 2, Milk -> 3:
#       Apple, Beans, Cookie, Milk
#       [1,     1,     1,      1]    <- first basket
#       [1,     1,     1,      0]    <- second basket (no Milk)
te = TransactionEncoder().fit(dataset)
te.columns_

['Apple',
 'Beans',
 'Coke',
 'Cookie',
 'Corn',
 'Eggs',
 'Ice cream',
 'Kidney Beans',
 'Milk',
 'Orange',
 'Yogurt']

In [4]:
# Learn the item vocabulary and one-hot encode the transactions in a single
# call; the result has one row per transaction and one column per item.
te_ary = te.fit_transform(dataset)
te_ary

array([[ True,  True, False,  True, False,  True, False, False,  True,
        False,  True],
       [ True,  True,  True,  True, False,  True, False, False, False,
        False,  True],
       [ True, False, False, False, False,  True, False,  True,  True,
        False, False],
       [False,  True, False, False,  True, False, False, False,  True,
         True,  True],
       [False,  True, False,  True,  True,  True,  True, False, False,
        False, False]])

In [5]:
# Display the encoded matrix as 0/1 integers instead of True/False.
# The result is only shown, not assigned, so te_ary keeps its original values.
te_ary.astype(int)

array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
       [1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0]])

In [16]:
# Wrap the encoded matrix in a DataFrame: one row per transaction, with the
# columns labelled by the item names the encoder learned.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Beans,Coke,Cookie,Corn,Eggs,Ice cream,Kidney Beans,Milk,Orange,Yogurt
0,True,True,False,True,False,True,False,False,True,False,True
1,True,True,True,True,False,True,False,False,False,False,True
2,True,False,False,False,False,True,False,True,True,False,False
3,False,True,False,False,True,False,False,False,True,True,True
4,False,True,False,True,True,True,True,False,False,False,False


In [10]:
# Item-presence matrix as a plain NumPy array of 0/1 integers
# (rows = transactions, columns = items in te.columns_ order).
# df holds booleans, so the cast is made explicit here; without it a fresh
# "Restart & Run All" would produce a True/False array rather than 0/1 values.
aa = df.astype(int).to_numpy()
aa

array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
       [1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0]])

In [11]:
# Support of each single item: the fraction of transactions (rows) in which
# the item's column is set, i.e. the column-wise mean of the presence matrix.
aa.mean(axis=0)

array([0.6, 0.8, 0.2, 0.6, 0.4, 0.8, 0.2, 0.2, 0.6, 0.2, 0.6])

In [14]:
# Tabulate every item name next to its support ('p' = share of transactions
# that contain the item).
pd.DataFrame({
    'item': te.columns_,
    'p': aa.mean(axis=0),
})

Unnamed: 0,item,p
0,Apple,0.6
1,Beans,0.8
2,Coke,0.2
3,Cookie,0.6
4,Corn,0.4
5,Eggs,0.8
6,Ice cream,0.2
7,Kidney Beans,0.2
8,Milk,0.6
9,Orange,0.2


In [17]:
# Mine every itemset whose support is at least 0.5; use_colnames=True reports
# the itemsets with item names rather than column indices.
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)

# Show the itemsets ordered by support, highest first. Only the last
# expression of a cell is displayed, so this is the single displayed result.
frequent_itemsets.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
1,0.8,(Beans)
3,0.8,(Eggs)
0,0.6,(Apple)
2,0.6,(Cookie)
4,0.6,(Milk)
5,0.6,(Yogurt)
6,0.6,"(Eggs, Apple)"
7,0.6,"(Beans, Cookie)"
8,0.6,"(Eggs, Beans)"
9,0.6,"(Yogurt, Beans)"


In [21]:
# Derive association rules from the frequent itemsets, keeping only the
# patterns whose lift is at least 1.2.
rules2 = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1.2,
)
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Eggs),(Apple),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
1,(Apple),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
2,(Beans),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
3,(Cookie),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
4,(Yogurt),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
5,(Beans),(Yogurt),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
6,(Eggs),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
7,(Cookie),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
8,"(Eggs, Beans)",(Cookie),0.6,0.6,0.6,1.0,1.666667,0.24,inf,1.0
9,"(Eggs, Cookie)",(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5


In [45]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Second example: five transactions of three items each, given as tuples.
# NOTE: this cell rebinds dataset, te, te_ary, df and frequent_itemsets, which
# the earlier grocery example also uses; re-run the earlier cells before
# revisiting their results.
dataset = [
    ('담보', '검객', '국제'),
    ('담보', '국제', '소리'),
    ('검객', '국제', '소리'),
    ('담보', '소리', '뮬란'),
    ('검객', '국제', '뮬란'),
]

# One-hot encode the transactions and label the columns with the item names.
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

# Frequent itemsets with support of at least 0.3.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.3)

# A lift threshold of 0.5 is permissive: rules with lift below 1 are listed too.
rules = association_rules(frequent_itemsets, min_threshold=0.5, metric="lift")
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(국제),(검객),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
1,(검객),(국제),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
2,(국제),(담보),0.8,0.6,0.4,0.5,0.833333,-0.08,0.8,-0.5
3,(담보),(국제),0.6,0.8,0.4,0.666667,0.833333,-0.08,0.6,-0.333333
4,(국제),(소리),0.8,0.6,0.4,0.5,0.833333,-0.08,0.8,-0.5
5,(소리),(국제),0.6,0.8,0.4,0.666667,0.833333,-0.08,0.6,-0.333333
6,(소리),(담보),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
7,(담보),(소리),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
