In [23]:
import numpy as np
import matplotlib as mpl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier,RandomTreesEmbedding
from sklearn.decomposition import PCA

In [24]:
# Global matplotlib settings:
#   - use the SimHei font for sans-serif text so CJK labels can be rendered;
#   - draw the minus sign as an ASCII hyphen, since CJK fonts often lack the
#     Unicode minus glyph.
mpl.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})

In [25]:
# Configure how matplotlib figures are displayed from Jupyter: the `tk`
# argument selects the Tk GUI backend (figures open in separate windows
# rather than inline in the notebook).
%matplotlib tk

In [26]:
# Location of the raw data file, relative to the notebook's working directory.
path = "car.data"
# The file has no header row, so columns are labelled 0..N-1 by pandas.
data = pd.read_csv(filepath_or_buffer=path, header=None)

In [27]:
### Raw data: preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [28]:
# List the distinct (sorted) values found in every column of the raw data.
# Iterating over data.columns instead of a hard-coded range(7) keeps the cell
# correct if the number of columns in the input file ever changes.
for column in data.columns:
    print(column, np.unique(data[column]))

0 ['high' 'low' 'med' 'vhigh']
1 ['high' 'low' 'med' 'vhigh']
2 ['2' '3' '4' '5more']
3 ['2' '4' 'more']
4 ['big' 'med' 'small']
5 ['high' 'low' 'med']
6 ['acc' 'good' 'unacc' 'vgood']


In [29]:
### Convert the string categories into integer ids
# Features: columns 0-5, each mapped independently to its categorical codes.
X = data[list(range(6))].apply(lambda column: pd.Categorical(column).codes)
# Target: column 6. The Categorical object is kept so the mapping from code
# back to the original label remains available.
Y_categorical = pd.Categorical(data[6])
Y = Y_categorical.codes

In [30]:
# Sanity check: first ten target codes, then the first two encoded feature rows.
print(Y[0:10])
X.head(n=2)

[2 2 2 2 2 2 2 2 2 2]


Unnamed: 0,0,1,2,3,4,5
0,3,3,0,0,2,1
1,3,3,0,0,2,2


In [31]:
### One-hot (dummy) encode the integer category codes
enc = OneHotEncoder()
X = enc.fit_transform(X)
# `enc.n_values_` was deprecated in scikit-learn 0.20 and removed in 0.22.
# The number of categories per input feature is now read from `categories_`;
# wrapping it in an ndarray keeps the printed output format unchanged.
print(np.array([len(categories) for categories in enc.categories_]))

[4 4 4 3 3 3]


In [32]:
### Data after one-hot encoding
# Densify the sparse matrix only for display purposes; X itself is unchanged.
df2 = pd.DataFrame(data=X.toarray())
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [33]:
# Summarise the one-hot encoded frame: row count, per-column dtype and
# non-null counts, and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 21 columns):
0     1728 non-null float64
1     1728 non-null float64
2     1728 non-null float64
3     1728 non-null float64
4     1728 non-null float64
5     1728 non-null float64
6     1728 non-null float64
7     1728 non-null float64
8     1728 non-null float64
9     1728 non-null float64
10    1728 non-null float64
11    1728 non-null float64
12    1728 non-null float64
13    1728 non-null float64
14    1728 non-null float64
15    1728 non-null float64
16    1728 non-null float64
17    1728 non-null float64
18    1728 non-null float64
19    1728 non-null float64
20    1728 non-null float64
dtypes: float64(21)
memory usage: 283.6 KB


In [34]:
# Convert the sparse one-hot matrix to a dense ndarray.
# The guard makes the cell safe to re-run: once X is already an ndarray it
# has no `toarray` method, and the unguarded call would raise AttributeError.
if hasattr(X, "toarray"):
    X = X.toarray()

In [35]:
# 1. Expand the feature space with a random-trees embedding
# Each sample is mapped to the index of the leaf it reaches in every tree,
# and those leaf indices are then one-hot encoded.
# random_state is fixed because the trees are built randomly: without a seed
# the number of resulting features changes from run to run.
rte = RandomTreesEmbedding(n_estimators=100, max_depth=7, random_state=0)
enc = OneHotEncoder()
# Compute the leaf indices once and reuse them for fitting and transforming
# the encoder (the original evaluated rte.apply(X) twice).
leaf_indices = rte.fit(X).apply(X)
X = enc.fit_transform(leaf_indices)

In [39]:
# (n_samples, n_features) after the random-trees expansion
X.shape

(1728, 12700)

In [15]:
# 2. Univariate feature selection: keep the 5000 features with the highest
#    chi-squared score against the target.
sk = SelectKBest(score_func=chi2, k=5000)
X = sk.fit_transform(X, Y)

In [16]:
# (n_samples, n_features) after SelectKBest
X.shape

(1728, 5000)

In [42]:
# 3. Model-based feature selection with SelectFromModel + GBDT
# Features whose importance in the fitted gradient-boosting model is below
# the threshold are dropped.
# random_state is fixed so that the fitted model, and therefore the set of
# selected features, is reproducible across runs.
sfm = SelectFromModel(GradientBoostingClassifier(random_state=0), threshold=0.000001)
X = sfm.fit(X, Y).transform(X)

In [43]:
# (n_samples, n_features) after SelectFromModel
X.shape

(1728, 874)

In [19]:
# 4. Dimensionality reduction with PCA
# Densify X first, as the original cell did. The guard keeps the cell
# re-runnable and tolerant of an already-dense X, where an unconditional
# X.toarray() would raise AttributeError.
if hasattr(X, "toarray"):
    X = X.toarray()
# random_state makes the projection reproducible when PCA picks a randomized
# SVD solver (the 'auto' default may do so for inputs of this size).
pca = PCA(n_components=100, random_state=0)
X = pca.fit(X).transform(X)

In [20]:
# Preview the first five rows of the PCA-projected data (head() defaults to n=5).
pd.DataFrame(data=X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.150607,0.002638,-0.03009,0.439218,-0.161143,-0.022285,-0.184742,-0.588047,0.072815,-0.114582,...,-0.102403,-0.124766,-0.045336,-0.035005,0.081434,0.000653,0.302419,0.009855,-0.057261,0.018714
1,-0.377087,0.017868,-0.074897,0.763909,-0.083085,0.024057,-0.20476,-0.418201,-0.019547,0.2374,...,0.249814,-0.085547,0.223135,0.315133,0.016173,-0.107058,0.053611,0.109241,0.33564,0.069532
2,0.336511,-0.047612,-0.16098,0.839839,-0.411284,0.227246,-0.383982,-0.317136,0.098322,0.253336,...,0.092244,0.100825,-0.242471,-0.344868,0.106186,0.173086,0.187702,0.362565,-0.050099,-0.007632
3,-0.184976,-0.230141,0.087474,0.311627,-0.248707,0.00112,-0.445984,-0.596026,0.231876,-0.088655,...,0.06285,0.095529,-0.305709,-0.005652,0.004877,-0.048576,0.104552,0.017026,0.207919,0.04754
4,-0.413767,-0.244062,0.139471,0.101171,-0.1566,0.070104,-0.617955,-0.509945,0.279583,0.145779,...,0.294468,0.009144,-0.380667,0.153808,-0.067713,-0.278032,0.057977,-0.048459,0.180371,0.040718
