In [1]:
# 資料加工、處理、分析函式庫
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# 視覺化函式庫
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習函式庫
import sklearn

# 顯示到小數點後第3位
%precision 3

'%.3f'

#### 9-4-1 何謂購物籃分析?
購物籃分析是指分析哪些商品容易被一起購買，例如「購買商品A的顧客是否也傾向購買商品B」，藉此找出商品之間的關聯性。

#### 9-4-2 讀取用來進行購物籃分析的樣本資料

In [5]:
# Loading approach 1: open the remote workbook with pd.ExcelFile and
# parse the 'Online Retail' sheet into a DataFrame.
trans = (
    pd.ExcelFile('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')
    .parse('Online Retail')
)
# Preview the first rows.
trans.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [9]:
# Loading approach 2: read the same sheet in a single call to pd.read_excel.
trans2 = pd.read_excel(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx',
    sheet_name='Online Retail',
)
# Preview the first rows.
trans2.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [10]:
# Add a cancel_flg column holding the first character of each InvoiceNo
# (InvoiceNo is converted to str first because it may hold non-string values).
trans['cancel_flg'] = trans['InvoiceNo'].map(lambda invoice_no: str(invoice_no)[0])

# Count the records per cancel_flg value.
# (The stray trailing '+' that made this cell a SyntaxError has been removed.)
trans.groupby('cancel_flg').size()

cancel_flg
5    532618
A         3
C      9288
dtype: int64

In [11]:
# Keep only rows whose cancel_flg is '5' and whose CustomerID is present.
trans = trans.loc[trans['cancel_flg'].eq('5') & trans['CustomerID'].notna()]

#### 9-4-3 關聯規則

In [12]:
# Count the records per StockCode and show the five most frequent codes.
trans.StockCode.value_counts().head(5)

85123A    2035
22423     1724
85099B    1618
84879     1408
47566     1397
Name: StockCode, dtype: int64

支持度(support):同時包含某個商品與另一個商品的購物籃數量，或該數量佔全體購物籃的比例。

In [16]:
# Collect every distinct InvoiceNo (one per basket) into trans_all.
trans_all = set(trans['InvoiceNo'])

# Baskets (InvoiceNo values) that contain item 85123A -> trans_a.
trans_a = set(trans.loc[trans['StockCode'] == '85123A', 'InvoiceNo'])
print(len(trans_a))

# Baskets (InvoiceNo values) that contain item 85099B -> trans_b.
trans_b = set(trans.loc[trans['StockCode'] == '85099B', 'InvoiceNo'])
print(len(trans_b))

# Baskets that contain both 85123A and 85099B -> trans_ab (set intersection).
trans_ab = trans_a & trans_b
# Fixed: the original printed len(train_ab), an undefined name (NameError).
print(len(trans_ab))

1978
1600
252


In [17]:
# Report trans_ab (the baskets containing both items): its size and its
# share of all baskets.
print(
    '含有兩商品的購物籃數量:{}'.format(len(trans_ab)),
    '含有兩商品的購物籃佔全體的比例:{}'.format(len(trans_ab) / len(trans_all)),
    sep='\n',
)

含有兩商品的購物籃數量:252
含有兩商品的購物籃佔全體的比例:0.013595166163141994


In [18]:
# Report the baskets containing item 85123A: their count and their share
# of all baskets.
print(
    '含有商品85123A的購物籃數量:{}'.format(len(trans_a)),
    '含有商品85123A的購物籃佔全體的比例:{}'.format(len(trans_a) / len(trans_all)),
    sep='\n',
)

含有商品85123A的購物籃數量:1978
含有商品85123A的購物籃佔全體的比例:0.10671126456624946


可信度(confidence):在購買了商品A的購物籃當中，同時也購買了商品B的購物籃所佔的比例。

In [20]:
# Confidence of "85123A -> 85099B": among baskets containing 85123A,
# the fraction that also contain 85099B.
print(
    '可信度:{:.3f}'.format(len(trans_ab) / len(trans_a))
)

可信度:0.127


In [21]:
# Confidence of "85099B -> 85123A": among baskets containing 85099B,
# the fraction that also contain 85123A.
print(
    '可信度:{:.3f}'.format(len(trans_ab) / len(trans_b))
)

可信度:0.158


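增益值(lift):針對「購買商品A的顧客也會購買商品B」這條規則，將其可信度除以商品B的支持度，即 $\mathrm{lift}(A \Rightarrow B) = \dfrac{\mathrm{confidence}(A \Rightarrow B)}{\mathrm{support}(B)}$。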

In [22]:
# Confidence of the rule A -> B: share of baskets with item A that also
# contain item B.
confidence = len(trans_ab) / len(trans_a)

# Support of item B: share of all baskets that contain item B.
# (The name `supper_b` is kept as-is; it denotes the support of B.)
supper_b = len(trans_b) / len(trans_all)

# Lift = confidence of the rule divided by the support of B.
lift = confidence / supper_b
print(
    'lift:{:.3f}'.format(lift)
)

lift:1.476


當可信度的數值相當高，增益值卻低於1.0時，以此當作推薦給顧客的根據也許並不適當。

### 第9章 綜合習題

In [28]:
import itertools

# Select the StockCodes that appear in more than 1000 records.
# value_counts() is computed once and reused for both the mask and the index.
stock_counts = trans['StockCode'].value_counts()
indexer = stock_counts > 1000
Items = stock_counts[indexer].index

# All distinct InvoiceNo values; the size of this set is the denominator
# of every support value below.
trans_all = set(trans['InvoiceNo'])

# Build the set of invoices containing each selected StockCode once, rather
# than rescanning the whole frame twice for every pair inside the loop.
invoice_sets = {
    code: set(trans.loc[trans['StockCode'] == code, 'InvoiceNo'])
    for code in Items
}

# Support of every unordered pair of selected StockCodes:
# (number of invoices containing both items) / (number of all invoices).
results = {
    pair: len(invoice_sets[pair[0]] & invoice_sets[pair[1]]) / len(trans_all)
    for pair in itertools.combinations(Items, 2)
}

# Pick the pair with the highest support. Using key=results.get compares only
# the support values; the original max over (value, key) tuples fell back to
# comparing the keys on ties, which can raise TypeError because the StockCodes
# mix int and str values.
maxkey = max(results, key=results.get)
print('支持度最大的StockCode組合:{}'.format(maxkey))
print('支持度最大值:{:.4f}'.format(results[maxkey]))

支持度最大的StockCode組合:(20725, 22383)
支持度最大值:0.0280


In [29]:
# NOTE(review): the original cell body was a bare '-', which is a SyntaxError.
# The recorded output below shows the boolean `indexer` Series followed by the
# `Items` index, so this cell presumably displayed those two objects; confirm
# against the original notebook.
display(indexer, Items)

85123A     True
22423      True
85099B     True
84879      True
47566      True
          ...  
90168     False
90169     False
90202A    False
90118     False
23843     False
Name: StockCode, Length: 3665, dtype: bool

Index(['85123A',    22423, '85099B',    84879,    47566,    20725,    22720,
          20727,   'POST',    23203,    22383,    21212,    22197,    23209,
          23298,    22086,    22382],
      dtype='object')