# 基于FP-Growth（Apriori）关联分析算法的商品购物篮分析

# 一、数据介绍

* Member_number：用户ID  
* Date：日期  
* itemDescription：商品名称

# 二、分析概述

# 三、数据预处理

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws,collect_list
from pyspark.ml.fpm import FPGrowth

In [3]:
# Create (or reuse) the SparkSession for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("FP-Growth")
    .getOrCreate()
)

In [4]:
# Load the raw CSV data.
# header=True: the first row of the file holds the column names.
# inferSchema=True: let Spark infer each column's data type; with False every
# column would be read as a string.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Groceries_dataset.csv")
)

In [5]:
data.show()

+-------------+----------+--------------------+
|Member_number|      Date|     itemDescription|
+-------------+----------+--------------------+
|         1808|21-07-2015|      tropical fruit|
|         2552|05-01-2015|          whole milk|
|         2300|19-09-2015|           pip fruit|
|         1187|12-12-2015|    other vegetables|
|         3037|01-02-2015|          whole milk|
|         4941|14-02-2015|          rolls/buns|
|         4501|08-05-2015|    other vegetables|
|         3803|23-12-2015|          pot plants|
|         2762|20-03-2015|          whole milk|
|         4119|12-02-2015|      tropical fruit|
|         1340|24-02-2015|        citrus fruit|
|         2193|14-04-2015|                beef|
|         1997|21-07-2015|         frankfurter|
|         4546|03-09-2015|             chicken|
|         4736|21-07-2015|              butter|
|         1959|30-03-2015|fruit/vegetable j...|
|         1974|03-05-2015|packaged fruit/ve...|
|         2421|02-09-2015|           cho

In [6]:
data.count()

38765

In [7]:
data.dtypes

[('Member_number', 'int'), ('Date', 'string'), ('itemDescription', 'string')]

## 缺失值处理

In [10]:
# Compute the missing-value rate of every column in a DataFrame.
def calculate_missing_rates(df):
    """
    Compute the percentage of missing values in each column.

    A value counts as missing when it is null, the empty string "" or a
    single space " ".

    Parameters:
    - df: PySpark DataFrame

    Returns:
    - missing_rates_df: PySpark DataFrame with one row per input column and
      two columns: "Column" (the column name) and "MissingRate" (the share
      of missing values as a percentage, 0-100). For a DataFrame with no
      rows every rate is reported as 0.0 instead of dividing by zero.
    """
    # Imported locally so the function does not rely on extra file-level imports.
    from pyspark.sql import functions as F

    total_rows = df.count()
    columns = df.columns

    if total_rows and columns:
        # count() ignores nulls and when() without otherwise() yields null for
        # non-matching rows, so each expression counts the missing values of
        # one column. All columns are evaluated in a single aggregation pass
        # rather than one filter/count job per column.
        missing_exprs = [
            F.count(
                F.when(
                    F.col(name).isNull()
                    | (F.col(name) == "")
                    | (F.col(name) == " "),
                    1,
                )
            )
            for name in columns
        ]
        missing_counts = df.agg(*missing_exprs).first()
    else:
        # Nothing to scan: no rows means no missing values.
        missing_counts = [0] * len(columns)

    missing_rates = [
        (name, (missing / total_rows) * 100 if total_rows else 0.0)
        for name, missing in zip(columns, missing_counts)
    ]

    # Build a DataFrame to present the result.
    missing_rates_df = spark.createDataFrame(missing_rates, ["Column", "MissingRate"])
    return missing_rates_df

In [13]:
missing_rates = calculate_missing_rates(data)

In [14]:
missing_rates.show()

+---------------+-----------+
|         Column|MissingRate|
+---------------+-----------+
|  Member_number|        0.0|
|           Date|        0.0|
|itemDescription|        0.0|
+---------------+-----------+



* 本数据集中不存在缺失值

In [17]:
# Remove duplicate rows. FP-Growth only looks at the member and item columns
# and is insensitive to the order of items, so de-duplication is done on the
# (Member_number, itemDescription) pairs only.
data = data.select("Member_number", "itemDescription").distinct()

In [18]:
data.count()

34766

* 总计34766条数据参与构建FP-Growth模型

# 四、数据建模

In [20]:
# Build the input DataFrame for FP-Growth: one row per member, with all of
# that member's items gathered into an array column named "items".
grouped_df = (
    data.groupBy("Member_number")
    .agg(collect_list(col("itemDescription")).alias("items"))
)

In [21]:
grouped_df.show()

+-------------+--------------------+
|Member_number|               items|
+-------------+--------------------+
|         1000|[hygiene articles...|
|         1001|[whipped/sour cre...|
|         1002|[other vegetables...|
|         1003|[frozen meals, de...|
|         1004|[tropical fruit, ...|
|         1005|[whipped/sour cre...|
|         1006|[bottled water, b...|
|         1008|[liver loaf, dome...|
|         1009|[tropical fruit, ...|
|         1010|[candles, pip fru...|
|         1011|[citrus fruit, he...|
|         1012|[tropical fruit, ...|
|         1013|[bottled water, t...|
|         1014|[whole milk, butt...|
|         1015|[citrus fruit, ro...|
|         1016|[mayonnaise, red/...|
|         1017|[yogurt, soda, ro...|
|         1018|[curd, root veget...|
|         1019|[hamburger meat, ...|
|         1020|[canned beer, spi...|
+-------------+--------------------+
only showing top 20 rows



In [22]:
grouped_df.count()

3898

In [23]:
grouped_df.dtypes

[('Member_number', 'int'), ('items', 'array<string>')]

In [40]:
# Configure the FP-Growth estimator.
# minSupport: support is the frequency of an itemset in the data set, i.e. the
#             share of transactions that contain the itemset. Only itemsets
#             whose support is >= minSupport are considered frequent.
#
# minConfidence: confidence is a measure of an association rule: the
#                probability that the consequent appears given the antecedent.
#                Only rules whose confidence is >= minConfidence are kept as
#                strong association rules.
fp_growth = FPGrowth(
    minSupport=0.15,
    minConfidence=0.5,
    itemsCol="items",
)

In [41]:
# Fit the configured FP-Growth estimator on the grouped item lists.
model = fp_growth.fit(grouped_df)

In [42]:
model.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|           [sausage]| 803|
|         [pip fruit]| 665|
|        [rolls/buns]|1363|
|[rolls/buns, whol...| 696|
|    [tropical fruit]| 911|
|      [bottled beer]| 619|
|            [yogurt]|1103|
|[yogurt, whole milk]| 587|
|       [canned beer]| 644|
|     [bottled water]| 833|
|  [other vegetables]|1468|
|[other vegetables...| 746|
|            [pastry]| 692|
|      [citrus fruit]| 723|
|     [shopping bags]| 656|
|              [soda]|1222|
|  [soda, whole milk]| 589|
|   [root vegetables]| 899|
|[whipped/sour cream]| 603|
|        [whole milk]|1786|
+--------------------+----+



In [43]:
# Show the frequent itemsets, most frequent first.
model.freqItemsets.orderBy(col("freq").desc()).show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|        [whole milk]|1786|
|  [other vegetables]|1468|
|        [rolls/buns]|1363|
|              [soda]|1222|
|            [yogurt]|1103|
|    [tropical fruit]| 911|
|   [root vegetables]| 899|
|     [bottled water]| 833|
|           [sausage]| 803|
|[other vegetables...| 746|
|      [citrus fruit]| 723|
|[rolls/buns, whol...| 696|
|            [pastry]| 692|
|         [pip fruit]| 665|
|     [shopping bags]| 656|
|       [canned beer]| 644|
|      [bottled beer]| 619|
|[whipped/sour cream]| 603|
|  [soda, whole milk]| 589|
|[yogurt, whole milk]| 587|
+--------------------+----+



* 在这份数据集中，购买 whole milk（1786 人）、other vegetables（1468 人）和 rolls/buns（1363 人）的顾客最多。

In [44]:
# Display the generated association rules (up to 30 rows).
model.associationRules.show(n=30)

+------------------+------------+------------------+------------------+-------------------+
|        antecedent|  consequent|        confidence|              lift|            support|
+------------------+------------+------------------+------------------+-------------------+
|[other vegetables]|[whole milk]|0.5081743869209809|1.1091062487222754| 0.1913801949717804|
|          [yogurt]|[whole milk]|0.5321849501359928|1.1615100423460805|0.15059004617752694|
|      [rolls/buns]|[whole milk]|0.5106382978723404|1.1144838102499344|0.17855310415597742|
+------------------+------------+------------------+------------------+-------------------+



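当最小支持度为0.15、最小置信度为0.5时，共得到3条关联规则（support 表示同时购买前项和后项的顾客占全部顾客的比例，confidence 表示购买前项的顾客中同时购买后项的比例）：  
* 约19%的顾客同时购买过 other vegetables 和 whole milk；购买 other vegetables 的顾客中约51%也购买了 whole milk。  
* 约15%的顾客同时购买过 yogurt 和 whole milk；购买 yogurt 的顾客中约53%也购买了 whole milk。  
* 约18%的顾客同时购买过 rolls/buns 和 whole milk；购买 rolls/buns 的顾客中约51%也购买了 whole milk。  
* 三条规则的提升度（lift）均在1.11～1.16之间，仅略大于1，说明这些商品与 whole milk 之间只存在较弱的正相关。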