## Apriori Algorithm

### 1. 数据预处理

In [1]:
import numpy as np
import pandas as pd
# Parse the workbook once and pull both yearly sheets from the same call.
# Passing a list to `sheet_name` returns a dict keyed by sheet name, which
# avoids opening and parsing 'retail.xlsx' a second time.
sheets = pd.read_excel('retail.xlsx', sheet_name=["Year 2009-2010", "Year 2010-2011"])
data_2009 = sheets["Year 2009-2010"]
data_2010 = sheets["Year 2010-2011"]
# Stack both periods into a single frame with a fresh 0..n-1 index.
data = pd.concat([data_2009, data_2010], ignore_index=True)

# data = pd.read_excel('retail.xlsx', sheet_name="Year 2010-2011")

In [2]:
# Sanity check: (rows, columns) of the frame built by concatenating both sheets.
data.shape

(1067371, 8)

#### 缺失值处理

In [3]:
# Count missing values per column in a single pass and report only the
# columns that actually contain some. The printed text is unchanged.
missing_counts = data.isnull().sum()
for columnName, n_missing in missing_counts[missing_counts > 0].items():
    print(f"{columnName}中有{n_missing}个缺失值.")

Description中有4382个缺失值.
Customer ID中有243007个缺失值.


In [4]:
# Drop every row that has a missing value in any column, then renumber the
# index. `dropna()` already returns a new frame, so `data` is left untouched.
rows_without_na = data.dropna()
data_drop = rows_without_na.reset_index(drop=True)
print(data_drop.shape)

(824364, 8)


#### 选取United Kingdom数据


In [5]:
# Keep only the rows whose Country is exactly "United Kingdom".
data_UK = data_drop.loc[data_drop["Country"].eq("United Kingdom")].reset_index(drop=True)

In [6]:
# (rows, columns) remaining after keeping only United Kingdom rows.
data_UK.shape

(741301, 8)

#### 删除Invoice中含有字母“C”的数据行

In [7]:
# Cast both identifier columns to str so that string operations apply to
# every value (this writes the converted columns back into data_UK).
for id_col in ("Invoice", "StockCode"):
    data_UK[id_col] = data_UK[id_col].astype(str)

# Discard every row whose invoice number contains the letter "C".
invoice_has_c = data_UK['Invoice'].str.contains('C')
data_invoice = data_UK.loc[~invoice_has_c].reset_index(drop=True)

In [8]:
# (rows, columns) remaining after removing invoices that contain "C".
data_invoice.shape

(725296, 8)

#### 处理价格与销量中的异常值

异常值：记 1% 与 99% 分位数分别为 q01、q99，低于 q01 − 1.5 × (q99 − q01) 或高于 q99 + 1.5 × (q99 − q01) 的值视为异常值。

使用阈值来代替数据中的异常值。

In [9]:
def outlier_thresholds(dataframe, variable, q_low=0.01, q_high=0.99, whisker=1.5):
    """Compute the lower and upper capping limits for one numeric column.

    The limits sit ``whisker`` times the inter-quantile range (the ``q_high``
    quantile minus the ``q_low`` quantile) outside those two quantiles. With
    the default arguments this is ``q01 - 1.5 * (q99 - q01)`` and
    ``q99 + 1.5 * (q99 - q01)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    variable : str
        Name of the numeric column to analyse.
    q_low, q_high : float, optional
        Lower and upper quantiles in ``[0, 1]``; ``q_low`` must be smaller
        than ``q_high``. Defaults are 0.01 and 0.99.
    whisker : float, optional
        Multiplier applied to the inter-quantile range. Default is 1.5.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If the quantiles are not ordered or fall outside ``[0, 1]``.
    """
    if not 0 <= q_low < q_high <= 1:
        raise ValueError(
            f"expected 0 <= q_low < q_high <= 1, got q_low={q_low}, q_high={q_high}"
        )
    # One quantile call returns both cut points (ordered like the request).
    lower_quantile, upper_quantile = dataframe[variable].quantile([q_low, q_high])
    spread = upper_quantile - lower_quantile
    up_limit = upper_quantile + whisker * spread
    low_limit = lower_quantile - whisker * spread
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    """Cap the values of one column at its outlier limits, in place.

    The limits come from ``outlier_thresholds(dataframe, variable)``. Values
    below the lower limit are overwritten with the lower limit and values
    above the upper limit with the upper limit. The frame passed in is
    modified directly and nothing is returned.
    """
    lower_bound, upper_bound = outlier_thresholds(dataframe, variable)
    too_small = dataframe[variable] < lower_bound
    dataframe.loc[too_small, variable] = lower_bound
    # Evaluated after the lower cap has been written, as in the two-step original.
    too_large = dataframe[variable] > upper_bound
    dataframe.loc[too_large, variable] = upper_bound

In [10]:
# Cap extreme values in the Quantity and Price columns.
# replace_with_thresholds writes into data_invoice directly and returns nothing,
# so data_invoice itself is changed by this cell.
outlier_cols = ["Quantity", "Price"]
for col in outlier_cols:
    replace_with_thresholds(data_invoice, col)

#### 价格与销量为负数或零的删去

In [11]:
# Keep only rows with a strictly positive Quantity ...
# NOTE: this rebinds data_invoice to the filtered rows (index not renumbered).
data_invoice = data_invoice.loc[data_invoice["Quantity"].gt(0)]
# ... and, of those, only rows with a strictly positive Price.
data_price = data_invoice.loc[data_invoice["Price"].gt(0)].reset_index(drop=True)

In [12]:
# (rows, columns) remaining after keeping only positive Quantity and Price.
data_price.shape

(725250, 8)

#### 如果一个StockCode对应的Description存在不一致的情况，则删除有该StockCode的所有数据行

In [13]:
# For each StockCode, count how many distinct Description values it has.
# The result is a Series indexed by StockCode.
data_stock = data_price.groupby("StockCode")["Description"].nunique()

In [14]:
# StockCodes that map to more than one distinct Description.
Index_over = data_stock.loc[data_stock.gt(1)].index.tolist()

In [15]:
# Remove every row whose StockCode is among those with inconsistent descriptions.
has_ambiguous_code = data_price["StockCode"].isin(Index_over)
data_stockcode = data_price.loc[~has_ambiguous_code].reset_index(drop=True)
data_stockcode.shape

(551392, 8)

#### 删除StockCode中的POST

In [16]:
# Remove the rows whose StockCode is exactly "POST".
data_finalstock = data_stockcode.loc[data_stockcode["StockCode"].ne("POST")].reset_index(drop=True)
data_finalstock.shape

(551356, 8)

In [17]:
# Preview the first rows of the cleaned frame.
data_finalstock.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12.0,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12.0,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48.0,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
3,489434,22064,PINK DOUGHNUT TRINKET POT,24.0,2009-12-01 07:45:00,1.65,13085.0,United Kingdom
4,489434,21871,SAVE THE PLANET MUG,24.0,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


#### 删除其中对分析关联规则无用的列

In [18]:
# Drop the columns that are not needed to build transactions.
data_simple = data_finalstock.drop(
    ["Description", "Quantity", "InvoiceDate", "Price", "Customer ID", "Country"],
    axis="columns",
)
data_simple.head()

Unnamed: 0,Invoice,StockCode
0,489434,85048
1,489434,79323P
2,489434,22041
3,489434,22064
4,489434,21871


### 2. 格式转化
调整格式，从而在之后便于使用Apriori模块。

In [25]:
# Collect the StockCodes of each invoice into one tuple (one transaction per
# invoice), then turn the resulting Series into a plain list of tuples.
codes_by_invoice = data_simple.groupby("Invoice")["StockCode"]
data_trans = codes_by_invoice.apply(tuple).reset_index(drop=True)
list_trans = data_trans.tolist()

In [20]:
# Number of transactions, i.e. one entry per distinct Invoice.
len(list_trans)

32643

### 3. 寻找关联规则
调用Apriori模块。

In [21]:
from efficient_apriori import apriori

# Mine frequent itemsets and association rules from the transaction list.
# min_confidence is written out at the library's default value so that both
# thresholds are visible to the reader.
itemsets, rules = apriori(
    list_trans,
    min_support=0.01,
    min_confidence=0.5,
)

In [22]:
# Pair each rule with its support, order the pairs from highest to lowest
# support, and print the five best-supported rules.
rule_supports = [(candidate, candidate.support) for candidate in rules]
sorted_rules = sorted(rule_supports, key=lambda pair: pair[1], reverse=True)
top_five_rules = sorted_rules[:5]
for rule, support in top_five_rules:
    print(f"Rule: {rule}")

Rule: {82494L} -> {82482} (conf: 0.565, supp: 0.030, lift: 11.457, conv: 2.188)
Rule: {82482} -> {82494L} (conf: 0.603, supp: 0.030, lift: 11.457, conv: 2.388)
Rule: {21755} -> {21754} (conf: 0.528, supp: 0.023, lift: 9.755, conv: 2.005)
Rule: {22910} -> {22086} (conf: 0.556, supp: 0.020, lift: 11.099, conv: 2.139)
Rule: {22727} -> {22726} (conf: 0.608, supp: 0.019, lift: 21.620, conv: 2.479)
