In [86]:
from itertools import permutations

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline


## 数据准备

In [87]:
# Load the iris dataset; only the two petal measurements are used as features.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)[
    ["petal length (cm)", "petal width (cm)"]
]
# Target values from the dataset, kept for comparison with cluster assignments.
y = pd.Series(iris.target)
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


## FeatureUnion 分别查看 linear PCA 与 kernel PCA 生成特征数


In [88]:
# Build a FeatureUnion with two branches: linear PCA and RBF-kernel PCA.
fu_list = [("linear_pca", PCA()), ("kernel_pca", KernelPCA(kernel="rbf"))]
combined = FeatureUnion(transformer_list=fu_list, n_jobs=-1)
# Note: set_params changes `combined` in place, so there is no need to write
# `combined = combined.set_params(...)`.
# A branch is disabled with the string "drop". Passing None for this purpose
# was deprecated in scikit-learn 0.22 and is rejected by current releases.
print("linear PCA and kernel PCA", combined.fit_transform(X).shape)
print("------------------------------------")
print("linear PCA only", combined.set_params(linear_pca=PCA(), kernel_pca="drop").fit_transform(X).shape)
print("------------------------------------")
# After this call `combined` is left with only the kernel-PCA branch active.
print("kernel PCA only", combined.set_params(linear_pca="drop", kernel_pca=KernelPCA(kernel="rbf")).fit_transform(X).shape)


linear PCA and kernel PCA (150, 130)
------------------------------------


linear PCA only (150, 2)
------------------------------------


kernel PCA only (150, 128)


## FeatureUnion 嵌入 Pipeline 只选择 kernel PCA

In [None]:
# Keep only the kernel-PCA branch of the union. "drop" disables the linear
# branch (None is no longer accepted for this by current scikit-learn).
combined.set_params(linear_pca="drop", kernel_pca=KernelPCA(kernel="rbf"))

# A fixed random_state makes the KMeans initialisation, and therefore the
# cluster ids written below, reproducible across runs. The ids themselves are
# still arbitrary and do not correspond to the values in `y`.
pl_list = [("combined", combined), ("KMeans", KMeans(n_clusters=3, random_state=0))]
pipe = Pipeline(pl_list)

# Store the predicted cluster id ("y^") next to the true label ("y").
pd.DataFrame(data={"y^": pipe.fit_predict(X), "y": y}).to_csv(
    "Sklearn/FeatureUnion/kernel_pca.csv", index=False
)

## FeatureUnion 嵌入 Pipeline 只选择 linear PCA

In [None]:
# Keep only the linear-PCA branch of the union. "drop" disables the kernel
# branch (None is no longer accepted for this by current scikit-learn).
combined.set_params(linear_pca=PCA(), kernel_pca="drop")

# A fixed random_state makes the KMeans initialisation, and therefore the
# cluster ids written below, reproducible across runs. The ids themselves are
# still arbitrary and do not correspond to the values in `y`.
pl_list = [("combined", combined), ("KMeans", KMeans(n_clusters=3, random_state=0))]
pipe = Pipeline(pl_list)

# Store the predicted cluster id ("y^") next to the true label ("y").
pd.DataFrame(data={"y^": pipe.fit_predict(X), "y": y}).to_csv(
    "Sklearn/FeatureUnion/linear_pca.csv", index=False
)


## FeatureUnion 嵌入 Pipeline 选择 kernel PCA 与 linear PCA

In [None]:
# Re-enable both branches so the union concatenates linear-PCA and
# kernel-PCA features.
combined.set_params(linear_pca=PCA(), kernel_pca=KernelPCA(kernel="rbf"))

# A fixed random_state makes the KMeans initialisation, and therefore the
# cluster ids written below, reproducible across runs. The ids themselves are
# still arbitrary and do not correspond to the values in `y`.
pl_list = [("combined", combined), ("KMeans", KMeans(n_clusters=3, random_state=0))]
pipe = Pipeline(pl_list)

# Store the predicted cluster id ("y^") next to the true label ("y").
pd.DataFrame(data={"y^": pipe.fit_predict(X), "y": y}).to_csv(
    "Sklearn/FeatureUnion/kernel_linear_pca.csv", index=False
)


## 不选择 kernel PCA 与 linear PCA 直接聚类


In [113]:
# Baseline: cluster the raw petal features directly, with no PCA step.
# A fixed random_state keeps the written cluster ids reproducible across runs;
# the ids are still arbitrary and do not correspond to the values in `y`.
pd.DataFrame(data={"y^": KMeans(n_clusters=3, random_state=0).fit_predict(X), "y": y}).to_csv(
    "Sklearn/FeatureUnion/no_pca.csv", index=False
)

## 评估聚类效果

In [114]:
# Read the result files from the same relative locations the earlier cells
# wrote them to, rather than from a machine-specific absolute path.
path = ["Sklearn/FeatureUnion/linear_pca.csv",
        "Sklearn/FeatureUnion/kernel_pca.csv",
        "Sklearn/FeatureUnion/kernel_linear_pca.csv",
        "Sklearn/FeatureUnion/no_pca.csv"]

linear = pd.read_csv(path[0])
kernel = pd.read_csv(path[1])
kernel_linear = pd.read_csv(path[2])
no_pca = pd.read_csv(path[3])


def aligned_accuracy(result):
    """Return the accuracy of cluster ids against true labels after alignment.

    KMeans cluster ids are arbitrary, so a hand-written remapping is only
    valid for one particular run. This tries every one-to-one assignment of
    cluster ids to class labels and returns the best accuracy found.

    Parameters
    ----------
    result : pandas.DataFrame
        Frame with a "y" column (true labels) and a "y^" column (cluster ids).

    Returns
    -------
    float
        Highest accuracy over all one-to-one cluster-to-class assignments.

    Raises
    ------
    ValueError
        If there are more distinct cluster ids than distinct class labels,
        in which case no one-to-one assignment exists.
    """
    y_true = result.loc[:, "y"]
    y_pred = result.loc[:, "y^"]
    cluster_ids = sorted(y_pred.unique())
    class_ids = sorted(y_true.unique())
    if len(cluster_ids) > len(class_ids):
        raise ValueError("more clusters than classes; cannot align one-to-one")
    # With 3 clusters this is only 3! = 6 candidate assignments.
    return max(
        accuracy_score(y_true, y_pred.map(dict(zip(cluster_ids, assignment))))
        for assignment in permutations(class_ids, len(cluster_ids))
    )


print("linear:", aligned_accuracy(linear))
print("kernel:", aligned_accuracy(kernel))
print("kernel linear:", aligned_accuracy(kernel_linear))
print("no_pca:", aligned_accuracy(no_pca))

linear: 0.96
kernel: 0.946666666667
kernel linear: 0.946666666667
no_pca: 0.96


## And So On