目标：探究用户对物品类别的喜好细分

流程：
1. 获取数据
2. 合并表
3. 找到user_id和aisle之间的关系（交叉表）
4. PCA降维
5. KMeans聚类
6. 用轮廓系数评估聚类效果

In [2]:
import pandas as pd
import numpy as np

1. 获取数据

In [3]:
# Load each raw Instacart CSV into its own DataFrame; later cells refer to
# these frames by the names bound here.
aisles = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/aisles.csv",
)
deps = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/departments.csv",
)
order_products_prior = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/order_products__prior.csv",
)
order_products_train = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/order_products__train.csv",
)
order = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/orders.csv",
)
prods = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/products.csv",
)
sample_sub = pd.read_csv(
    filepath_or_buffer="/data/ys_data/instacart/sample_submission.csv",
)

展示数据

In [4]:
# Print the first rows (DataFrame.head() default) of every loaded table,
# each preceded by its label, in the same order the tables were loaded.
previews = (
    ("aisles:\n", aisles),
    ("deps:\n", deps),
    ("order_products_prior:\n", order_products_prior),
    ("order_products_train:\n", order_products_train),
    ("order:\n", order),
    ("prods:\n", prods),
    ("sample_sub:\n", sample_sub),
)
for heading, frame in previews:
    print(heading, frame.head())

aisles:
    aisle_id                       aisle
0         1       prepared soups salads
1         2           specialty cheeses
2         3         energy granola bars
3         4               instant foods
4         5  marinades meat preparation
deps:
    department_id department
0              1     frozen
1              2      other
2              3     bakery
3              4    produce
4              5    alcohol
order_products_prior:
    order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
order_products_train:
    order_id  product_id  add_to_cart_order  reordered
0         1       49302                  1          1
1         1       11109                  2          1
2         1       10246                  3  

合并表
1. aisles & prods -> tab1 (aisle_id)
2. tab1 & order_products_prior -> tab2 (product_id)
3. tab2 & order -> tab3 (order_id)

In [5]:
# Chain three inner joins so that every prior-order line item carries its
# product, aisle and order columns (including user_id) in a single table.
# Each join uses exactly one shared key column, so the key is passed as a
# single name rather than a list that repeats the same name twice.
tab1 = pd.merge(aisles, prods, how="inner", on="aisle_id")
tab2 = pd.merge(tab1, order_products_prior, how="inner", on="product_id")
tab3 = pd.merge(tab2, order, how="inner", on="order_id")

In [6]:
# Show the first five rows of the fully merged table.
tab3.head(n=5)

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0,114082,prior,26,0,20,1.0
1,1,prepared soups salads,22853,Pesto Pasta Salad,20,94246,4,0,114082,prior,26,0,20,1.0
2,4,instant foods,12087,Chicken Flavor Ramen Noodle Soup,9,94246,15,0,114082,prior,26,0,20,1.0
3,4,instant foods,47570,Original Flavor Macaroni & Cheese Dinner,9,94246,14,1,114082,prior,26,0,20,1.0
4,13,prepared meals,10089,Dolmas,20,94246,25,0,114082,prior,26,0,20,1.0


交叉表找user_id和aisle之间的关系

In [7]:
# Count, for every user_id (rows), how many merged line items fall in each
# aisle (columns).
table = pd.crosstab(
    index=tab3["user_id"],
    columns=tab3["aisle"],
)

In [8]:
# Show the first five rows of the user-by-aisle count table.
table.head(n=5)

aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,3,0,0,0,0,2,0,0,0,...,3,1,1,0,0,0,0,2,0,42
3,0,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


PCA降维

In [9]:
from sklearn.decomposition import PCA

In [10]:
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share of the variance (here 95%).
explained_variance_target = 0.95
transfer = PCA(n_components=explained_variance_target)

# Fit on the user-by-aisle counts and project them in one step.
table_new = transfer.fit_transform(X=table)

In [11]:
# Report (rows, retained components) of the PCA output.
np.shape(table_new)

(206209, 44)

In [12]:
# Display the array returned by transfer.fit_transform(table).
table_new

array([[-24.21565874,   2.4294272 ,  -2.46636975, ...,  -0.08877715,
         -0.38087761,   0.21568831],
       [  6.46320806,  36.75111647,   8.38255336, ...,   1.912145  ,
          1.79468946,  -0.70142249],
       [ -7.99030162,   2.40438257, -11.03006405, ...,  -0.72188348,
         -1.15719089,  -0.23704277],
       ...,
       [  8.61143331,   7.70129866,   7.95240226, ...,   0.23971061,
         -0.78590175,  -2.65945606],
       [ 84.08621987,  20.41873398,   8.05410372, ...,  -1.66893212,
          0.5042934 ,   3.82546312],
       [-13.95345619,   6.64621821,  -5.23030367, ...,  -1.64144758,
         -3.39233648,  -0.31410713]])

In [13]:
# Model training: configure K-means with three clusters.
from sklearn.cluster import KMeans

# n_init is set explicitly instead of relying on the library default. The
# recorded fit output for this notebook shows a warning raised from the
# n_init default check (default_n_init=10), so 10 is the value the default
# resolved to for that run; pinning it keeps the behaviour stable across
# scikit-learn versions and avoids that warning.
estimator = KMeans(n_clusters=3, n_init=10)

In [14]:
# Fit the K-means estimator on the PCA-reduced user features.
estimator.fit(X=table_new)

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
# Assign every row of the PCA-reduced data to its nearest fitted cluster centre.
y_predict = estimator.predict(X=table_new)

In [17]:
# Inspect the cluster labels of the first 300 rows.
y_predict[0:300]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0,
       1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], d

In [18]:
# Model evaluation - silhouette coefficient (range [-1, 1]; a value close to 1 means the clustering is good, a value close to -1 means it is poor)
from sklearn.metrics import silhouette_score

In [19]:
# Score the clustering: silhouette coefficient of the predicted labels on the
# PCA-reduced features.
silhouette_score(X=table_new, labels=y_predict)

0.5366618821483505