# 案例：Instacart市场篮子分析
- 根据用户对物品类别的喜好，对Instacart Market用户进行聚类（数据来自kaggle）
- 这里先用PCA降维，再用K-means聚类

# 一、数据处理部分

**流程分析：**
1. 需要将 user_id 和 aisle（物品类别）放到同一个表：合并
2. 找到 user_id 和 aisle 的关系：交叉表、透视表
3. 去除冗余特征：这里用PCA降维

In [1]:
import pandas as pd
from sklearn.decomposition import PCA

## 1、读取数据

In [2]:
# Load the four Instacart tables in one pass; the paths are listed in the
# same order as the names they are unpacked into.
aisles, order_products__prior, orders, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./day1资料/02-代码/instacart/aisles.csv",
        "./day1资料/02-代码/instacart/order_products__prior.csv",
        "./day1资料/02-代码/instacart/orders.csv",
        "./day1资料/02-代码/instacart/products.csv",
    )
)

In [3]:
aisles.columns # this table provides the 'aisle' column we need

Index(['aisle_id', 'aisle'], dtype='object')

In [4]:
# List the columns to spot the keys this table shares with the others.
order_products__prior.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')

In [5]:
orders.columns # this table provides the 'user_id' column we need

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [6]:
# List the columns to spot the keys this table shares with the others.
products.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id'], dtype='object')

## 2、合并表
- 找各个表的共同列索引，逐个合并，最终将 user_id 和 aisle 合并在一个表中

In [7]:
# The 'aisle' names live in aisles, and products also carries aisle_id,
# so join the two tables on that shared key.
tab1 = aisles.merge(products, how='inner', on=['aisle_id'])
tab1.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id
0,1,prepared soups salads,209,Italian Pasta Salad,20
1,1,prepared soups salads,554,Turkey Chili,20
2,1,prepared soups salads,886,Whole Grain Salad with Roasted Pecans & Mango ...,20
3,1,prepared soups salads,1600,Mediterranean Orzo Salad,20
4,1,prepared soups salads,2539,Original Potato Salad,20


In [8]:
# product_id is also present in order_products__prior, so use it as the key
# to attach the order lines to tab1.
tab2 = tab1.merge(order_products__prior, how='inner', on=['product_id'])
tab2.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0
1,1,prepared soups salads,209,Italian Pasta Salad,20,192465,2,1
2,1,prepared soups salads,209,Italian Pasta Salad,20,195206,18,1
3,1,prepared soups salads,209,Italian Pasta Salad,20,227717,1,1
4,1,prepared soups salads,209,Italian Pasta Salad,20,260072,13,0


In [9]:
# order_id is also present in orders; joining on it brings user_id in, so
# tab_final holds both 'aisle' and 'user_id' and the merging step is done.
tab_final = tab2.merge(orders, how='inner', on=['order_id'])
tab_final.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0,114082,prior,26,0,20,1.0
1,1,prepared soups salads,209,Italian Pasta Salad,20,192465,2,1,119977,prior,2,0,16,3.0
2,1,prepared soups salads,209,Italian Pasta Salad,20,195206,18,1,1519,prior,7,2,9,5.0
3,1,prepared soups salads,209,Italian Pasta Salad,20,227717,1,1,161125,prior,7,2,11,11.0
4,1,prepared soups salads,209,Italian Pasta Salad,20,260072,13,0,12012,prior,5,5,11,11.0


In [10]:
# Check the size (rows, columns) of the fully merged table.
tab_final.shape

(32434489, 14)

## 3、找到关系
- 可以用交叉表（crosstab）或透视表（pivot_table）；这里用交叉表统计每个用户在各物品类别下的购买次数

In [11]:
# Count, for every user, how many purchased items fall into each aisle:
# user_id becomes the row index and aisle the column index.
relation = pd.crosstab(index=tab_final['user_id'], columns=tab_final['aisle'])
relation.head()

aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,3,0,0,0,0,2,0,0,0,...,3,1,1,0,0,0,0,2,0,42
3,0,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [12]:
# Shape of the user-by-aisle count table.
relation.shape

(206209, 134)

## 4、PCA降维

In [13]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance -- 95% here.
transfer = PCA(n_components=0.95)
rela_new = transfer.fit_transform(X=relation)
rela_new

array([[-24.21565874,  -2.4294272 ,  -2.46636975, ...,   0.08877715,
          0.38087761,   0.21568831],
       [  6.46320806, -36.75111647,   8.38255336, ...,  -1.912145  ,
         -1.79468946,  -0.70142249],
       [ -7.99030162,  -2.40438257, -11.03006405, ...,   0.72188348,
          1.15719089,  -0.23704277],
       ...,
       [  8.61143331,  -7.70129866,   7.95240226, ...,  -0.23971061,
          0.78590175,  -2.65945606],
       [ 84.08621987, -20.41873398,   8.05410372, ...,   1.66893212,
         -0.5042934 ,   3.82546312],
       [-13.95345619,  -6.64621821,  -5.23030367, ...,   1.64144758,
          3.39233648,  -0.31410713]], shape=(206209, 44))

In [14]:
rela_new.shape # fewer columns than relation has: the reduction worked

(206209, 44)

# 二、聚类部分

**流程分析：**
- 已经有降维后的数据（没有标签值，不用划分数据集）
1. 训练聚类模型
2. 模型评估

In [15]:
# Show the PCA-reduced features again; they are the input to the clustering step.
rela_new

array([[-24.21565874,  -2.4294272 ,  -2.46636975, ...,   0.08877715,
          0.38087761,   0.21568831],
       [  6.46320806, -36.75111647,   8.38255336, ...,  -1.912145  ,
         -1.79468946,  -0.70142249],
       [ -7.99030162,  -2.40438257, -11.03006405, ...,   0.72188348,
          1.15719089,  -0.23704277],
       ...,
       [  8.61143331,  -7.70129866,   7.95240226, ...,  -0.23971061,
          0.78590175,  -2.65945606],
       [ 84.08621987, -20.41873398,   8.05410372, ...,   1.66893212,
         -0.5042934 ,   3.82546312],
       [-13.95345619,  -6.64621821,  -5.23030367, ...,   1.64144758,
          3.39233648,  -0.31410713]], shape=(206209, 44))

## 1、训练聚类模型

In [16]:
from sklearn.cluster import KMeans

In [17]:
# K=3: start by splitting the users into three clusters.
# random_state pins the centroid initialisation so the cluster labels, and any
# score computed from them, are reproducible from run to run; without it each
# run may yield a different clustering.
esti = KMeans(n_clusters=3, random_state=42)
esti.fit(rela_new) # unsupervised training: only features are passed, there are no labels

In [18]:
# Assign every sample to one of the fitted clusters.
y_pred = esti.predict(X=rela_new)
# Look at the first 300 labels to get a feel for the clustering result.
y_pred[0:300]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2,
       2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0,
       2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0,
       0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2], dtype=int32)

## 2、模型评估

In [19]:
from sklearn.metrics import silhouette_score

In [20]:
# Mean silhouette coefficient of the clustering, computed from the reduced
# features and their predicted cluster labels.
# NOTE(review): this scores every sample; silhouette_score also accepts
# sample_size/random_state if the full computation is too slow -- confirm
# whether an exact score is required before switching.
silhouette_score(X=rela_new, labels=y_pred)

np.float64(0.5373818482995539)