## Apriori Algorithm

### 1. 数据预处理

In [29]:
import numpy as np
import pandas as pd
from efficient_apriori import apriori
# Disabled alternative: read both yearly sheets and stack them into one frame.
# data_2009 = pd.read_excel('retail.xlsx', sheet_name="Year 2009-2010")
# data_2010 = pd.read_excel('retail.xlsx', sheet_name="Year 2010-2011")
# data = pd.concat([data_2009, data_2010]).reset_index(drop=True)

# Active path: only the "Year 2010-2011" sheet of retail.xlsx is loaded.
data = pd.read_excel('retail.xlsx', sheet_name="Year 2010-2011")

In [30]:
# Show the (rows, columns) dimensions of the loaded sheet.
data.shape

(541910, 8)

#### 缺失值处理

In [31]:
# Report every column that contains missing values, together with the count.
# isna().sum() yields the number of nulls per column directly, so there is no
# need to build the list of affected row labels just to measure its length.
for columnName, n_missing in data.isna().sum().items():
    # Columns without any missing value are skipped silently.
    if n_missing > 0:
        print(f"{columnName}中有{n_missing}个缺失值.")

Description中有1454个缺失值.
Customer ID中有135080个缺失值.


In [32]:
# Discard every row that has a missing value in any column, then renumber
# the index from 0 so later positional lookups are contiguous.
rows_complete = data.dropna()
data_drop = rows_complete.reset_index(drop=True)
print(data_drop.shape)

(406830, 8)


#### 选取United Kingdom数据


In [33]:
# Keep only the rows whose Country is exactly "United Kingdom".
is_uk = data_drop["Country"].eq("United Kingdom")
data_UK = data_drop.loc[is_uk].reset_index(drop=True)

In [34]:
# Dimensions after restricting the data to United Kingdom rows.
data_UK.shape

(361878, 8)

#### 删除Invoice中含有字母“C”的数据行

In [35]:
# Cast both identifier columns to text so the .str accessor can be used on them.
for id_column in ("Invoice", "StockCode"):
    data_UK[id_column] = data_UK[id_column].astype(str)

# Exclude every row whose invoice number contains the letter "C".
invoice_has_c = data_UK["Invoice"].str.contains('C')
data_invoice = data_UK.loc[~invoice_has_c].reset_index(drop=True)

In [36]:
# Dimensions after removing invoices that contain the letter "C".
data_invoice.shape

(354345, 8)

#### 删去价格或销量不为正数（≤ 0）的数据行

In [37]:
# Step 1: keep rows with a strictly positive Quantity (zero is removed as well).
# data_invoice is rebound to the filtered frame; its index is not renumbered here.
positive_quantity = data_invoice["Quantity"].gt(0)
data_invoice = data_invoice.loc[positive_quantity]

# Step 2: of those, keep rows with a strictly positive Price and renumber the index.
positive_price = data_invoice["Price"].gt(0)
data_price = data_invoice.loc[positive_price].reset_index(drop=True)

In [38]:
# Dimensions after keeping only rows with positive Quantity and positive Price.
data_price.shape

(354321, 8)

#### 如果一个StockCode对应的Description存在不一致的情况，则删除有该StockCode的所有数据行

In [39]:
# For each StockCode, count how many distinct Description values it occurs with.
descriptions_by_code = data_price.groupby("StockCode")["Description"]
data_stock = descriptions_by_code.nunique()

In [40]:
# StockCodes that are associated with more than one distinct Description.
Index_over = data_stock.loc[data_stock.gt(1)].index.tolist()

In [41]:
# Remove every row whose StockCode is in the list of codes with inconsistent
# descriptions, then renumber the index and show the resulting dimensions.
code_is_ambiguous = data_price["StockCode"].isin(Index_over)
data_stockcode = data_price.loc[~code_is_ambiguous].reset_index(drop=True)
data_stockcode.shape

(322421, 8)

#### 删除StockCode中的POST

In [42]:
# Remove rows whose StockCode is the literal "POST", renumber the index,
# and show the resulting dimensions.
code_is_not_post = data_stockcode["StockCode"].ne("POST")
data_finalstock = data_stockcode.loc[code_is_not_post].reset_index(drop=True)
data_finalstock.shape

(322394, 8)

In [43]:
# Preview the first rows of the fully cleaned frame.
data_finalstock.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
1,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
3,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
4,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom


#### 删除其中对分析关联规则无用的列

In [44]:
# Columns that are not used when building the invoice baskets.
unused_columns = [
    "Description",
    "Quantity",
    "InvoiceDate",
    "Price",
    "Customer ID",
    "Country",
]
data_simple = data_finalstock.drop(labels=unused_columns, axis="columns")
data_simple.head()

Unnamed: 0,Invoice,StockCode
0,536365,84406B
1,536365,84029G
2,536365,84029E
3,536365,22752
4,536365,21730


### 2. 格式转化
将数据按Invoice分组，把每张发票中的StockCode整理为一个元组，得到一个由元组组成的列表，便于之后使用Apriori模块。

In [45]:
# Collect the StockCodes of each invoice into one tuple (one basket per invoice);
# the Invoice key itself is dropped by reset_index(drop=True).
codes_per_invoice = data_simple.groupby("Invoice")["StockCode"]
data_trans = codes_per_invoice.apply(tuple).reset_index(drop=True)

# Plain Python list of tuples, the form passed to apriori later on.
list_trans = list(data_trans.values)

In [46]:
# Number of baskets, i.e. one entry per distinct Invoice.
len(list_trans)

16468

In [59]:
# Mine frequent itemsets and association rules from the invoice baskets.
# `apriori` is imported in the notebook's import cell together with the other
# dependencies, so a fresh "Restart & Run All" shows every dependency up front.

# Minimum support threshold passed to apriori, named here so it is easy to tune.
MIN_SUPPORT = 0.01

# Only min_support is supplied; every other apriori parameter keeps the
# library default.
itemsets, rules = apriori(list_trans, min_support=MIN_SUPPORT)

In [65]:
# Pair each mined rule with its support value so the pairs can be ranked.
rule_supports = [(candidate, candidate.support) for candidate in rules]


def _support_of(pair):
    """Return the support component of a (rule, support) pair."""
    return pair[1]


# Highest support first; the five leading pairs are reported.
sorted_rules = sorted(rule_supports, key=_support_of, reverse=True)
top_five_rules = sorted_rules[:5]
for rule, support in top_five_rules:
    print(f"Rule: {rule}, Support: {support}")

Rule: {22386} -> {85099B} (conf: 0.623, supp: 0.031, lift: 7.092, conv: 2.420), Support: 0.030726256983240222
Rule: {22699} -> {22697} (conf: 0.702, supp: 0.029, lift: 18.892, conv: 3.232), Support: 0.02890454214233665
Rule: {22697} -> {22699} (conf: 0.778, supp: 0.029, lift: 18.892, conv: 4.315), Support: 0.02890454214233665
Rule: {22384} -> {20725} (conf: 0.555, supp: 0.029, lift: 8.167, conv: 2.096), Support: 0.028600923002186057
Rule: {23301} -> {23300} (conf: 0.617, supp: 0.028, lift: 16.212, conv: 2.513), Support: 0.02781151323779451
