# 연관분석(Apriori)

In [54]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import  matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: use the 'Malgun Gothic' font family and a plain hyphen for
# minus signs instead of the Unicode minus glyph.
mpl.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})
plt.style.use("ggplot")

# Request "retina" output for inline figures.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

# Suppress every warning from here on.
import warnings
warnings.filterwarnings(action="ignore")

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

from statsmodels.sandbox.stats.runs import Runs

## 데이터 불러오기

In [12]:
# Data Load
# Read the groceries CSV (path is relative to the working directory) into a DataFrame.
grocery = pd.read_csv('data/Groceries_dataset.csv')
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print('grocery.shape :',grocery.shape)
grocery.head()

grocery.shape : (38765, 3)


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [13]:
# Data Information
# Print column names, non-null counts, dtypes and memory usage of the frame.
grocery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


## 트랜잭션 데이터 생성

In [53]:
# Transaction-data construction: One-Hot Encoding method
# The bare string below is an illustrative list-of-lists transaction dataset.
# It is not assigned to any name; it only becomes the cell's displayed value.
'''
dataset = [
    ['아메리카노', '카페라떼'],
    ['카페라떼', '아메리카노', '카푸치노'],
    ['바닐라라떼', '아메리카노'],
]'''
# Reference usage of TransactionEncoder on such a dataset (kept commented out):
# te = TransactionEncoder()
# te_ary = te.fit(dataset).transform(dataset)

# df = pd.DataFrame(te_ary, columns=te.columns_)

"\ndataset = [\n    ['아메리카노', '카페라떼'],\n    ['카페라떼', '아메리카노', '카푸치노'],\n    ['바닐라라떼', '아메리카노'],\n]"

In [15]:
# One-Hot Encoding
# Build one indicator column per distinct value of `itemDescription`, with one
# row per row of `grocery`.
# The dtype is pinned to uint8 (the pre-2.0 pandas default) because pandas >= 2.0
# defaults to bool, which would turn the 0/1 indicators into True/False.
oh_item = pd.get_dummies(grocery['itemDescription'], dtype='uint8')
print("oh_item.shape :",oh_item.shape)
oh_item.head()

oh_item.shape : (38765, 167)


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
# Replace the raw item-description column with its one-hot indicator columns:
# the remaining columns of `grocery` come first, followed by `oh_item`.
# `drop(columns=...)` returns a new frame, so `grocery` itself is left untouched.
gr_df = pd.concat(
    [grocery.drop(columns='itemDescription'), oh_item],
    axis=1,
)
gr_df.head()

Unnamed: 0,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [32]:
# Build the transaction table: one row per (Member_number, Date) pair, holding
# the summed item indicators of every row that falls in that group.
te = (
    gr_df
    .groupby(by=['Member_number', 'Date'])
    .sum()
)
te.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000,15-03-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1000,24-06-2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000,24-07-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,25-11-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,27-05-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Reshape the transaction table into the layout used by the Apriori step.
# The (Member_number, Date) group keys sit in the index, so discarding the
# index leaves only the item columns (and keeps the cell safe to re-run).
te = te.reset_index(drop=True)
# Binarize: any count >= 1 becomes 1, everything else 0. The vectorized
# comparison replaces the per-cell `applymap` lambda (deprecated in pandas 2.1).
# int64 is requested explicitly so the dtype does not depend on the platform.
te = (te >= 1).astype('int64')
# Plain NumPy view of the same 0/1 matrix; shown as the cell output.
te_ary = te.to_numpy()
print(te_ary.shape)
te_ary

(14963, 167)


array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## 연관분석 런 검정(Runs test)

In [56]:
# Run the runs test on te_ary[0] only, i.e. the first row of the 0/1 matrix.
# The displayed result is a pair of floats (presumably z-statistic and p-value).
# NOTE(review): the sequence being tested is one row laid out in item-column
# order; confirm that a runs test on a single row is the intended check.
Runs(te_ary[0]).runs_test()

(-3.195523082222796, 0.0013957761783879039)

### 연관규칙 계산 : Apriori Algorithm

In [46]:
# Extract the frequent itemsets whose support reaches the minimum threshold,
# limited to itemsets of at most two items (max_len=2).
# mlxtend expects a boolean one-hot frame; 0/1 integers are still accepted but
# go through a slower, deprecated path, so cast explicitly. The resulting
# supports are unchanged because the frame only holds 0/1 values.
itemset = apriori(df=te.astype(bool), min_support=0.001, use_colnames=True, verbose=True, max_len=2)
itemset

Processing 22052 combinations | Sampling itemset size 2


Unnamed: 0,support,itemsets
0,0.004010,(Instant food products)
1,0.021386,(UHT-milk)
2,0.001470,(abrasive cleaner)
3,0.001938,(artif. sweetener)
4,0.008087,(baking powder)
...,...,...
736,0.002941,"(whipped/sour cream, yogurt)"
737,0.003141,"(whole milk, white bread)"
738,0.001069,"(yogurt, white bread)"
739,0.001270,"(white wine, whole milk)"


In [51]:
# Keep rules whose confidence is at least 0.1, order them from most to least
# confident, and display the first ten.
(
    association_rules(itemset, metric='confidence', min_threshold=0.1)
    .sort_values(by='confidence', ascending=False)
    .head(10)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
99,(semi-finished bread),(whole milk),0.00949,0.157923,0.001671,0.176056,1.114825,0.000172,1.022008
30,(detergent),(whole milk),0.008621,0.157923,0.001403,0.162791,1.030824,4.2e-05,1.005814
46,(ham),(whole milk),0.017109,0.157923,0.00274,0.160156,1.014142,3.8e-05,1.002659
9,(bottled beer),(whole milk),0.045312,0.157923,0.007151,0.157817,0.99933,-5e-06,0.999874
37,(frozen fish),(whole milk),0.006817,0.157923,0.001069,0.156863,0.993287,-7e-06,0.998743
14,(candy),(whole milk),0.014369,0.157923,0.002139,0.148837,0.942468,-0.000131,0.989326
98,(sausage),(whole milk),0.060349,0.157923,0.008955,0.148394,0.939663,-0.000575,0.988811
67,(onions),(whole milk),0.02025,0.157923,0.002941,0.145215,0.919528,-0.000257,0.985133
88,(processed cheese),(rolls/buns),0.010158,0.110005,0.00147,0.144737,1.315734,0.000353,1.04061
90,(processed cheese),(whole milk),0.010158,0.157923,0.00147,0.144737,0.916503,-0.000134,0.984582
