In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.metrics.cluster import (
    silhouette_score,  # bigger is better
    calinski_harabasz_score,  # bigger is better
    davies_bouldin_score,  # smaller is better
)

# 获取数据

In [2]:
# Read the four Instacart CSV tables used below, in this order: order line
# items, products, orders, and aisles.
order_products_prior, products, orders, aisles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/instacart/order_products_prior.csv",
        "../data/instacart/products.csv",
        "../data/instacart/orders.csv",
        "../data/instacart/aisles.csv",
    )
)
# Display each table's (rows, columns) as a quick sanity check on the load.
tuple(frame.shape for frame in (order_products_prior, products, orders, aisles))

((32434489, 4), (49688, 4), (3421083, 7), (134, 2))

In [3]:
# Preview the first five order line items.
order_products_prior.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [4]:
# Preview the first five products.
products.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
# Preview the first five orders.
orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
# Preview the first five aisles.
aisles.head(n=5)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


# 数据基本处理
## merge合并表格

In [7]:
# Merge the first two tables: attach the product columns to every order line.
# The join is an inner join on product_id (pandas' default merge type).
order_products_prior_products = order_products_prior.merge(
    products, how="inner", on="product_id"
)
order_products_prior_products.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


In [8]:
# Merge in the third table: add the order columns (including user_id) to each
# order line via an inner join on order_id.
order_products_prior_products_orders = order_products_prior_products.merge(
    orders, how="inner", on="order_id"
)
order_products_prior_products_orders.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic Egg Whites,86,16,202279,prior,3,5,9,8.0
1,2,28985,2,1,Michigan Organic Kale,83,4,202279,prior,3,5,9,8.0
2,2,9327,3,0,Garlic Powder,104,13,202279,prior,3,5,9,8.0
3,2,45918,4,1,Coconut Butter,19,13,202279,prior,3,5,9,8.0
4,2,30035,5,0,Natural Sweetener,17,13,202279,prior,3,5,9,8.0


In [9]:
# Merge in the last table: resolve aisle_id to its aisle name with an inner
# join, giving one wide row per purchased item.
table = order_products_prior_products_orders.merge(
    aisles, how="inner", on="aisle_id"
)
table.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle
0,2,33120,1,1,Organic Egg Whites,86,16,202279,prior,3,5,9,8.0,eggs
1,26,33120,5,0,Organic Egg Whites,86,16,153404,prior,2,0,16,7.0,eggs
2,120,33120,13,0,Organic Egg Whites,86,16,23750,prior,11,6,8,10.0,eggs
3,327,33120,5,1,Organic Egg Whites,86,16,58707,prior,21,6,9,8.0,eggs
4,390,33120,28,1,Organic Egg Whites,86,16,166654,prior,48,0,12,9.0,eggs


In [10]:
# Show the (rows, columns) of the fully merged table.
table.shape

(32434489, 14)

## 交叉表合并

In [11]:
# Build a user x aisle frequency table: one row per user_id, one column per
# aisle, each cell counting that user's purchased items in that aisle.
table_cross = pd.crosstab(index=table["user_id"], columns=table["aisle"])
table_cross.shape

(206209, 134)

## 数据截取

In [17]:
# Keep only the first 1000 rows (users) of the cross table as the working
# sample; this is a positional slice, not a random sample.
table_train = table_cross.iloc[:1000]
table_train.head(n=5)

aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,3,0,0,0,0,2,0,0,0,...,3,1,1,0,0,0,0,2,0,42
3,0,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


# 特征工程 PCA

In [18]:
# A fractional n_components is the share of information (variance) to retain;
# an integer would instead be the number of dimensions to reduce to.
pca = PCA(n_components=0.9)

In [39]:
# Fit PCA on the user sample and project it onto the retained components.
table_pca = pca.fit_transform(table_train)
# The second dimension of the shape is the number of components that were kept.
table_pca.shape

(1000, 22)

# 聚类

## KMeans

In [42]:
# KMeans with 8 clusters; random_state is fixed so the run is repeatable.
# NOTE(review): 8 looks like an arbitrary choice — no model selection is shown.
kmeans = KMeans(n_clusters=8, random_state=0)

In [43]:
# Fit KMeans on the PCA features and get one cluster label per user.
kmeans_y_predict = kmeans.fit_predict(table_pca)
# Show only the first 100 labels to keep the output short.
kmeans_y_predict[:100]



array([0, 4, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0,
       4, 3, 4, 0, 0, 0, 0, 0, 0, 0, 4, 0])

### 评估

In [44]:
# Score the KMeans labelling with three internal metrics, printed in order:
#   silhouette         - bigger is better
#   Calinski-Harabasz  - bigger is better
#   Davies-Bouldin     - smaller is better
for cluster_metric in (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
):
    print(cluster_metric(table_pca, kmeans_y_predict))

0.4485421627503111
216.4035460707327
1.3097004015273708


## DBSCAN

In [57]:
# DBSCAN with neighbourhood radius eps=100; all other parameters are defaults.
# NOTE(review): eps is on the scale of the unscaled PCA features and looks
# hand-tuned — confirm how 100 was chosen.
dbscan = DBSCAN(eps=100)

In [58]:
# Fit DBSCAN on the PCA features and get one label per user.
# The displayed labels include -1, which scikit-learn documents as noise.
dbscan_y_predict = dbscan.fit_predict(table_pca)
# Show only the first 100 labels to keep the output short.
dbscan_y_predict[:100]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

### 评估

In [59]:
# Score the DBSCAN labelling with three internal metrics, printed in order:
#   silhouette         - bigger is better
#   Calinski-Harabasz  - bigger is better
#   Davies-Bouldin     - smaller is better
# NOTE(review): the labels contain -1 (noise) and are passed through unchanged,
# so the metrics presumably score the noise points as if they were one more
# cluster — consider masking them out before comparing against KMeans.
for cluster_metric in (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
):
    print(cluster_metric(table_pca, dbscan_y_predict))

0.7790699167143196
178.23841525583944
1.5594869115792767


## OPTICS

In [73]:
# OPTICS with min_cluster_size=0.1; all other parameters are defaults.
# NOTE(review): a float here is presumably a fraction of the sample count
# (i.e. 10% of the users) — confirm against the scikit-learn documentation.
optics = OPTICS(min_cluster_size=0.1)

In [74]:
# Fit OPTICS on the PCA features and get one label per user.
# The displayed labels include -1, which scikit-learn documents as noise.
optics_y_predict = optics.fit_predict(table_pca)
# Show only the first 100 labels to keep the output short.
optics_y_predict[:100]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

### 评估

In [75]:
# Score the OPTICS labelling with three internal metrics, printed in order:
#   silhouette         - bigger is better
#   Calinski-Harabasz  - bigger is better
#   Davies-Bouldin     - smaller is better
# NOTE(review): the labels contain -1 (noise) and are passed through unchanged,
# so the metrics presumably score the noise points as if they were one more
# cluster — consider masking them out before comparing against KMeans.
for cluster_metric in (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
):
    print(cluster_metric(table_pca, optics_y_predict))

0.76065776922134
287.920013835662
1.418633155114779
