## 异常值处理
<img src='./image/5.3.png' width=400 align=left />  
还可以填充、插值等。

In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.DataFrame({'A':['a0','a1','a1','a2','a3','a4'],
                 'B':['b0','b1','b2','b2','b3',None],
                 'C':[1,2,None,3,4,5],
                 'D':[0.1,10.2,11.4,8.9,9.1,12],
                 'E':[10,19,32,25,8,None],
                 'F':['f0','f1','g2','f3','f4','f5']})
df

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [3]:
df.isnull()

Unnamed: 0,A,B,C,D,E,F
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,True,False,False,True,False


In [4]:
# 去掉空值行
df.dropna()

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [6]:
# 去掉所有B列为空值的行
df.dropna(subset=['B'])

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [7]:
# 重复值识别
df.duplicated(['A'])

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool

In [8]:
# 如果填入两列，那么必须两列合起来都是重合的才算重合
df.duplicated(['A','B'])  

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [9]:
df.drop_duplicates(['A'])

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [10]:
df.drop_duplicates(['A'],keep='first')

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [11]:
df.drop_duplicates(['A'],keep=False)

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [13]:
df.fillna('b*')

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1,0.1,10,f0
1,a1,b1,2,10.2,19,f1
2,a1,b2,b*,11.4,32,g2
3,a2,b2,3,8.9,25,f3
4,a3,b3,4,9.1,8,f4
5,a4,b*,5,12.0,b*,f5


In [14]:
df['E'].fillna(df['E'].mean())

0    10.0
1    19.0
2    32.0
3    25.0
4     8.0
5    18.8
Name: E, dtype: float64

In [15]:
# 插值（Series和DataFrame都有interpolate方法，这里以Series为例）
df['E'].interpolate()

0    10.0
1    19.0
2    32.0
3    25.0
4     8.0
5     8.0
Name: E, dtype: float64

In [16]:
pd.Series([1,None,4,5,20]).interpolate()

0     1.0
1     2.5
2     4.0
3     5.0
4    20.0
dtype: float64

In [17]:
# 异常值
df['D']

0     0.1
1    10.2
2    11.4
3     8.9
4     9.1
5    12.0
Name: D, dtype: float64

In [18]:
upper_q=df['D'].quantile(0.75)
lower_q=df['D'].quantile(0.25)
q_int=upper_q-lower_q

In [19]:
# k is the multiplier applied to the inter-quartile range when building the fences.
k=1.5
# Keep rows whose D lies strictly between lower_q-k*q_int and upper_q+k*q_int.
# Both conditions are combined into ONE boolean mask: chaining two selections
# (df[mask1][mask2]) forces pandas to reindex the second mask against the
# already-filtered frame and emits a warning.
df[(df['D']>lower_q-k*q_int)&(df['D']<upper_q+k*q_int)]

  from ipykernel import kernelapp as app


Unnamed: 0,A,B,C,D,E,F
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [20]:
df['F']  # 去掉g2

0    f0
1    f1
2    g2
3    f3
4    f4
5    f5
Name: F, dtype: object

In [22]:
# Keep only the rows whose F value starts with 'f' (this removes 'g2').
# The vectorised string accessor replaces the hand-built list of booleans;
# na=False makes a missing value count as "no match" instead of raising.
df[df['F'].str.startswith('f',na=False)]

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


## 特征预处理
### 特征选择
HR.csv数据要处理的问题：根据其他属性预测人员离职率，推断什么样的人容易离职。
<br />因此label（标注）为left。其他属性为特征（features）。

In [3]:
import scipy.stats as ss

In [5]:
df=pd.DataFrame({'A':ss.norm.rvs(size=10),
                 'B':ss.norm.rvs(size=10),
                 'C':ss.norm.rvs(size=10),
                 'D':np.random.randint(low=0,high=2,size=10)})

In [6]:
df

Unnamed: 0,A,B,C,D
0,-0.678018,0.936138,0.183552,1
1,-0.642499,0.798407,-0.128748,0
2,0.558729,0.452267,-1.20717,1
3,-0.812791,-1.120381,-0.199199,1
4,0.16022,0.228545,0.029413,1
5,-0.938964,-0.507716,-1.745953,0
6,-0.279468,0.102509,0.958818,1
7,0.844665,1.098862,-0.270638,1
8,1.093935,-2.137262,0.239498,0
9,-0.981487,0.601891,-0.707032,0


In [8]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [10]:
X=df.loc[:,['A','B','C']]
Y=df.loc[:,'D']

In [11]:
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel

其中SelectKBest为过滤常用的类，RFE为包裹常用的类，SelectFromModel为嵌入常用类。过滤、包裹、嵌入均为特征选择的方法。

In [12]:
skb=SelectKBest(k=2)
skb.fit(X,Y)

SelectKBest(k=2, score_func=<function f_classif at 0x000001F23BB2C620>)

In [14]:
X

Unnamed: 0,A,B,C
0,-0.678018,0.936138,0.183552
1,-0.642499,0.798407,-0.128748
2,0.558729,0.452267,-1.20717
3,-0.812791,-1.120381,-0.199199
4,0.16022,0.228545,0.029413
5,-0.938964,-0.507716,-1.745953
6,-0.279468,0.102509,0.958818
7,0.844665,1.098862,-0.270638
8,1.093935,-2.137262,0.239498
9,-0.981487,0.601891,-0.707032


In [13]:
skb.transform(X)

array([[ 0.93613824,  0.18355237],
       [ 0.79840717, -0.12874761],
       [ 0.45226706, -1.20716983],
       [-1.12038128, -0.19919915],
       [ 0.22854538,  0.02941294],
       [-0.50771621, -1.74595297],
       [ 0.10250949,  0.95881812],
       [ 1.09886185, -0.27063839],
       [-2.13726177,  0.23949766],
       [ 0.60189103, -0.70703238]])

发现features只保留了B和C列

In [15]:
# estimator指定线性核的SVR回归器，step=1表示每迭代一次去掉1个特征。
rfe=RFE(estimator=SVR(kernel='linear'),n_features_to_select=2,step=1)

In [17]:
X

Unnamed: 0,A,B,C
0,-0.678018,0.936138,0.183552
1,-0.642499,0.798407,-0.128748
2,0.558729,0.452267,-1.20717
3,-0.812791,-1.120381,-0.199199
4,0.16022,0.228545,0.029413
5,-0.938964,-0.507716,-1.745953
6,-0.279468,0.102509,0.958818
7,0.844665,1.098862,-0.270638
8,1.093935,-2.137262,0.239498
9,-0.981487,0.601891,-0.707032


In [16]:
rfe.fit_transform(X,Y)

array([[ 0.93613824,  0.18355237],
       [ 0.79840717, -0.12874761],
       [ 0.45226706, -1.20716983],
       [-1.12038128, -0.19919915],
       [ 0.22854538,  0.02941294],
       [-0.50771621, -1.74595297],
       [ 0.10250949,  0.95881812],
       [ 1.09886185, -0.27063839],
       [-2.13726177,  0.23949766],
       [ 0.60189103, -0.70703238]])

发现只保留了B和C列

In [18]:
# threshold=0.1表示重要性因子小于0.1就会被去掉
sfm=SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.1)

In [19]:
sfm.fit_transform(X,Y)

array([[-0.6780179 ,  0.93613824],
       [-0.64249904,  0.79840717],
       [ 0.55872901,  0.45226706],
       [-0.812791  , -1.12038128],
       [ 0.16021982,  0.22854538],
       [-0.9389639 , -0.50771621],
       [-0.27946761,  0.10250949],
       [ 0.84466483,  1.09886185],
       [ 1.09393509, -2.13726177],
       [-0.98148656,  0.60189103]])

发现只保留了A列和B列。

注：estimator= 已经选择了方法，但是在特征选择中，是通过取数据样本来选择更重要的特征，后续的建模中才是用estimator指定的方法对全部数据进行建模。

### 特征变换
（1）对指化
<img src='./image/5.6.png' width=500 />

（2）离散化
<img src='./image/5.7.png' width=500 />

In [20]:
lst=[6,8,10,15,16,24,25,40,67]

In [21]:
# 等深（等频）分箱
pd.qcut(lst,q=3)

[(5.999, 13.333], (5.999, 13.333], (5.999, 13.333], (13.333, 24.333], (13.333, 24.333], (13.333, 24.333], (24.333, 67.0], (24.333, 67.0], (24.333, 67.0]]
Categories (3, interval[float64]): [(5.999, 13.333] < (13.333, 24.333] < (24.333, 67.0]]

In [22]:
pd.qcut(lst,q=3,labels=['low','medium','high'])

[low, low, low, medium, medium, medium, high, high, high]
Categories (3, object): [low < medium < high]

In [23]:
# 等宽（等距）分箱
pd.cut(lst,bins=3)

[(5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (26.333, 46.667], (46.667, 67.0]]
Categories (3, interval[float64]): [(5.939, 26.333] < (26.333, 46.667] < (46.667, 67.0]]

In [24]:
pd.cut(lst,bins=3,labels=['low','medium','high'])

[low, low, low, low, low, low, low, medium, high]
Categories (3, object): [low < medium < high]

（3）归一化与标准化

In [25]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [27]:
np.array([1,4,10,15,21])

array([ 1,  4, 10, 15, 21])

In [28]:
np.array([1,4,10,15,21]).reshape(-1,1) 
# reshape(-1,1)中，-1表示不指定行数，1表示指定为1列。

array([[ 1],
       [ 4],
       [10],
       [15],
       [21]])

In [26]:
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1))



array([[ 0.  ],
       [ 0.15],
       [ 0.45],
       [ 0.7 ],
       [ 1.  ]])

In [29]:
StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1))



array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.]])

In [30]:
StandardScaler().fit_transform(np.array([1,0,0,0,0,0,0,0]).reshape(-1,1))



array([[ 2.64575131],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447]])

（4）数值化

In [31]:
from sklearn.preprocessing import  LabelEncoder,OneHotEncoder

In [32]:
# LabelEncoder works on a 1-D array of labels, so the array is passed as-is;
# reshaping it to a (n, 1) column vector only triggers a conversion warning.
LabelEncoder().fit_transform(np.array(['Down','Up','Up','Down']))

  y = column_or_1d(y, warn=True)


array([0, 1, 1, 0], dtype=int64)

In [40]:
# In this walkthrough the colour names are first turned into integer codes with
# LabelEncoder; those codes are what gets one-hot encoded afterwards.
# LabelEncoder takes a 1-D array of labels, so no reshape(-1,1) is applied here
# (a column vector would only trigger a conversion warning).
lb_encoder=LabelEncoder()
lb_tran_f=lb_encoder.fit_transform(np.array(['Red','Yellow','Blue','Green']))

  y = column_or_1d(y, warn=True)


In [41]:
oht_encoder=OneHotEncoder().fit(lb_tran_f.reshape(-1,1))

In [44]:
# lb_encoder.transform receives the 1-D label array directly; only its integer
# output is reshaped into a single column, which is the 2-D layout passed to
# oht_encoder.transform.
oht_encoder.transform(lb_encoder.transform(np.array(['Yellow','Blue','Green','Green','Red'])).reshape(-1,1))

  y = column_or_1d(y, warn=True)


<5x4 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [48]:
oht_encoder.transform(lb_encoder.transform(np.array(['Yellow','Blue','Green','Green','Red'])).reshape(-1,1)).toarray()

array([[ 0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.]])

（5）正规化
<img src='./image/5.10.png' width=500 />
默认是针对行来处理，找到影响最大的特征（列）

In [49]:
from sklearn.preprocessing import Normalizer

In [53]:
# 正规化是对行处理的
Normalizer(norm='l1').fit_transform(np.array([1,1,3,-1,2]).reshape(-1,1))

array([[ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [ 1.]])

In [56]:
Normalizer(norm='l1').fit_transform(np.array([[1,1,3,-1,2]]))

array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])

In [57]:
Normalizer(norm='l2').fit_transform(np.array([[1,1,3,-1,2]]))

array([[ 0.25,  0.25,  0.75, -0.25,  0.5 ]])

## 特征降维
<img src='./image/5.11.png' width=500 />
<img src='./image/5.11_2.png' width=500 />
LDA思路：
<img src='./image/5.11_3.png' width=500 />
<img src='./image/5.11_4.png' width=500 />
<img src='./image/5.11_5.png' width=500 />
求出最优的$\omega$后正规化处理，将比较小的$\omega$对应的分量去掉，达到降维的目的。
如图，本来散点的两个特征为x,y，现在经过$\omega$变换成(x',y')，发现平行于分界线的y'坐标对分类意义不大，因此去掉，只保留x'一个特征。

In [58]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [59]:
X=np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y=np.array([1,1,1,2,2,2])

In [60]:
LinearDiscriminantAnalysis(n_components=1).fit_transform(X,Y)

array([[-1.73205081],
       [-1.73205081],
       [-3.46410162],
       [ 1.73205081],
       [ 1.73205081],
       [ 3.46410162]])

In [61]:
# LDA降维 当做判别器使用（Fisher判别器）
clf=LinearDiscriminantAnalysis(n_components=1).fit(X,Y)

In [63]:
clf.predict([[0.8,1]])

array([2])

【0.8,1】分类进Y=2的类别。

## 实战：对HR表特征处理

In [64]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [82]:
def hr_preprocessing(sl=False):
    """Load ./data/HR.csv, clean it and rescale the satisfaction_level column.

    Args:
        sl: Scaler used for ``satisfaction_level``. A falsy value applies
            MinMaxScaler, a truthy value applies StandardScaler.

    Returns:
        The cleaned feature DataFrame, without the ``left`` label column and
        with ``satisfaction_level`` rescaled. This version returns the
        features only.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Clean the data: drop rows with missing satisfaction_level /
    #    last_evaluation, then rows with satisfaction_level above 1 or with
    #    salary equal to 'nme'.
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    # A single combined mask avoids the reindexing warning caused by chaining
    # two boolean selections. The explicit copy makes the column assignments
    # below write to an independent frame.
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')].copy()
    # 2. Remove the label column ('left') from the features.
    df=df.drop('left',axis=1)
    # 3. Feature selection (nothing is dropped in this example).
    # 4. Feature processing: each flag picks the scaler for the matching column.
    scaler_lst=[sl]
    column_lst=['satisfaction_level']
    for use_standard,column in zip(scaler_lst,column_lst):
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        # The scalers need a 2-D (n, 1) input; ravel() flattens the result
        # back into a 1-D column.
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    return df

print(hr_preprocessing(sl=True))

       satisfaction_level  last_evaluation  number_project  \
0               -0.936495             0.53               2   
1                0.752814             0.86               5   
2               -2.022479             0.88               7   
3                0.431041             0.87               5   
4               -0.976716             0.52               2   
5               -0.815830             0.50               2   
6               -2.062701             0.77               6   
7                1.235474             0.85               5   
8                1.114809             1.00               5   
9               -0.775608             0.53               2   
10              -0.654943             0.54               2   
11              -2.022479             0.81               6   
12               0.913701             0.92               4   
13              -0.815830             0.55               2   
14              -1.016938             0.56               2   
15      

In [91]:
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False):
    """Load ./data/HR.csv, clean it, scale numeric columns and encode categoricals.

    For the seven numeric flags a falsy value applies MinMaxScaler and a
    truthy value applies StandardScaler to the matching column.

    Args:
        sl: Scaler choice for ``satisfaction_level``.
        le: Scaler choice for ``last_evaluation``.
        npr: Scaler choice for ``number_project``.
        amh: Scaler choice for ``average_monthly_hours``.
        tsc: Scaler choice for ``time_spend_company``.
        wa: Scaler choice for ``Work_accident``.
        pl5: Scaler choice for ``promotion_last_5years``.
        dp: Encoding for ``department``. Truthy: one-hot encoding via
            ``pd.get_dummies``. Falsy: LabelEncoder followed by MinMaxScaler.
        slr: Encoding for ``salary``. Truthy: one-hot encoding via
            ``pd.get_dummies``. Falsy: ordinal mapping through ``map_salary``
            followed by MinMaxScaler.

    Returns:
        The processed feature DataFrame, without the ``left`` label column.
        This version returns the features only.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Clean the data: drop rows with missing satisfaction_level /
    #    last_evaluation, then rows with satisfaction_level above 1 or with
    #    salary equal to 'nme'.
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    # A single combined mask avoids the reindexing warning caused by chaining
    # two boolean selections. The explicit copy makes the column assignments
    # below write to an independent frame.
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')].copy()
    # 2. Remove the label column ('left') from the features.
    df=df.drop('left',axis=1)
    # 3. Feature selection (nothing is dropped in this example).
    # 4. Feature processing.
    # 4a. Numeric columns: each flag picks the scaler for the matching column.
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=['satisfaction_level','last_evaluation','number_project','average_monthly_hours',
                'time_spend_company','Work_accident','promotion_last_5years']
    for use_standard,column in zip(scaler_lst,column_lst):
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        # The scalers need a 2-D (n, 1) input; ravel() flattens the result
        # back into a 1-D column.
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # 4b. Categorical columns: each flag picks one-hot or label encoding.
    encoder_lst=[dp,slr]
    category_lst=['department','salary']
    for use_one_hot,column in zip(encoder_lst,category_lst):
        if use_one_hot:
            # One-hot encoding is delegated to pandas' get_dummies.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes in alphabetical order, which would
            # break the low < medium < high ordering, so map_salary is used.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # After label encoding, normalise the integer codes with MinMaxScaler.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    return df

def map_salary(s):
    """Map a salary level to its ordinal rank.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    # Unknown levels are treated like the lowest one.
    return 0
    
print(hr_preprocessing(sl=True,le=True))

       satisfaction_level  last_evaluation  number_project  \
0               -0.936495        -1.087275             0.0   
1                0.752814         0.840707             0.6   
2               -2.022479         0.957554             1.0   
3                0.431041         0.899131             0.6   
4               -0.976716        -1.145699             0.0   
5               -0.815830        -1.262546             0.0   
6               -2.062701         0.314894             0.8   
7                1.235474         0.782283             0.6   
8                1.114809         1.658639             0.6   
9               -0.775608        -1.087275             0.0   
10              -0.654943        -1.028852             0.0   
11              -2.022479         0.548588             0.8   
12               0.913701         1.191249             0.4   
13              -0.815830        -0.970428             0.0   
14              -1.016938        -0.912004             0.0   
15      



接下来考虑一下需不需要降维。

In [92]:
from sklearn.decomposition import PCA

In [96]:
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, scale/encode the features and optionally reduce dimensions.

    For the seven numeric flags a falsy value applies MinMaxScaler and a
    truthy value applies StandardScaler to the matching column.

    Args:
        sl: Scaler choice for ``satisfaction_level``.
        le: Scaler choice for ``last_evaluation``.
        npr: Scaler choice for ``number_project``.
        amh: Scaler choice for ``average_monthly_hours``.
        tsc: Scaler choice for ``time_spend_company``.
        wa: Scaler choice for ``Work_accident``.
        pl5: Scaler choice for ``promotion_last_5years``.
        dp: Encoding for ``department``. Truthy: one-hot encoding via
            ``pd.get_dummies``. Falsy: LabelEncoder followed by MinMaxScaler.
        slr: Encoding for ``salary``. Truthy: one-hot encoding via
            ``pd.get_dummies``. Falsy: ordinal mapping through ``map_salary``
            followed by MinMaxScaler.
        lower_d: When truthy, the features are reduced with PCA before being
            returned; when falsy no dimensionality reduction is done.
        ld_n: Number of PCA components, only used when ``lower_d`` is truthy.

    Returns:
        A ``(features, label)`` tuple. ``features`` is the processed DataFrame,
        or the array produced by ``PCA.fit_transform`` when ``lower_d`` is
        truthy. ``label`` is the ``left`` column of the cleaned data.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Clean the data: drop rows with missing satisfaction_level /
    #    last_evaluation, then rows with satisfaction_level above 1 or with
    #    salary equal to 'nme'.
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    # A single combined mask avoids the reindexing warning caused by chaining
    # two boolean selections. The explicit copy makes the column assignments
    # below write to an independent frame.
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')].copy()
    # 2. Take the label after cleaning so it stays aligned with the features.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection (all features are kept in this example).
    # 4. Feature processing.
    # 4a. Numeric columns: each flag picks the scaler for the matching column.
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=['satisfaction_level','last_evaluation','number_project','average_monthly_hours',
                'time_spend_company','Work_accident','promotion_last_5years']
    for use_standard,column in zip(scaler_lst,column_lst):
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        # The scalers need a 2-D (n, 1) input; ravel() flattens the result
        # back into a 1-D column.
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # 4b. Categorical columns: each flag picks one-hot or label encoding.
    encoder_lst=[dp,slr]
    category_lst=['department','salary']
    for use_one_hot,column in zip(encoder_lst,category_lst):
        if use_one_hot:
            # One-hot encoding is delegated to pandas' get_dummies.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes in alphabetical order, which would
            # break the low < medium < high ordering, so map_salary is used.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # After label encoding, normalise the integer codes with MinMaxScaler.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not use the label. LDA is not used here because its
        # n_components is bounded by the number of label classes: 'left' only
        # takes the values 0 and 1, so LDA could only reduce to 1 dimension.
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal code of a salary level.

    'low', 'medium' and 'high' become 0, 1 and 2; a value that is not one of
    these three is coded as 0.
    """
    try:
        return {'low':0,'medium':1,'high':2}[s]
    except KeyError:
        # Not a known level: fall back to the lowest code.
        return 0
    
print(hr_preprocessing(sl=True,le=True,ld_n=3))

(       satisfaction_level  last_evaluation  number_project  \
0               -0.936495        -1.087275             0.0   
1                0.752814         0.840707             0.6   
2               -2.022479         0.957554             1.0   
3                0.431041         0.899131             0.6   
4               -0.976716        -1.145699             0.0   
5               -0.815830        -1.262546             0.0   
6               -2.062701         0.314894             0.8   
7                1.235474         0.782283             0.6   
8                1.114809         1.658639             0.6   
9               -0.775608        -1.087275             0.0   
10              -0.654943        -1.028852             0.0   
11              -2.022479         0.548588             0.8   
12               0.913701         1.191249             0.4   
13              -0.815830        -0.970428             0.0   
14              -1.016938        -0.912004             0.0   
15     



In [75]:
# Reload and clean the HR data, then inspect the (n, 1) array MinMaxScaler
# returns for satisfaction_level.
df=pd.read_csv('./data/HR.csv')
df=df.dropna(subset=['satisfaction_level','last_evaluation'])
# A single combined mask is used instead of two chained boolean selections,
# which would make pandas reindex the second mask and emit a warning.
df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
MinMaxScaler().fit_transform(df['satisfaction_level'].values.reshape(-1,1))

array([[ 0.31868132],
       [ 0.78021978],
       [ 0.02197802],
       ..., 
       [ 0.30769231],
       [ 0.02197802],
       [ 0.30769231]])

In [81]:
df['satisfaction_level'].values.reshape(-1,1).reshape(1,-1)[0]

array([ 0.38,  0.8 ,  0.11, ...,  0.37,  0.11,  0.37])