###### 异常值（空值）处理

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
import seaborn as sns

In [23]:
d = {'A':['a0','a1','a1'',a2','a3','a4'],'B':['b0','b1','b2','b2','b3','b4',None]}
k=list(d.keys())
v=list(d.values())
df=pd.DataFrame(list(zip(k,v)),columns=['k','v'])
df

Unnamed: 0,k,v
0,A,"[a0, a1, a1,a2, a3, a4]"
1,B,"[b0, b1, b2, b2, b3, b4, None]"


In [39]:
d = {'A':['a0','a1','a1','a2','a3','a4'],'B':['b0','b1','b2','b2','b3',None],
     'C':[1,2,None,3,4,5],'D':[0.1,10.2,11.4,8.9,9.1,12],'E':[10,19,32,25,8,None],
    'F':['f0','f1','g2','f3','f4','f5']}
df = pd.DataFrame(list(d.values())).T
df

Unnamed: 0,0,1,2,3,4,5
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [40]:
df.isnull()

Unnamed: 0,0,1,2,3,4,5
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,True,False,False,True,False


In [48]:
df.dropna(subset=[2])

Unnamed: 0,0,1,2,3,4,5
0,a0,b0,1,0.1,10.0,f0
1,a1,b1,2,10.2,19.0,f1
3,a2,b2,3,8.9,25.0,f3
4,a3,b3,4,9.1,8.0,f4
5,a4,,5,12.0,,f5


In [51]:
df.duplicated([1])#识别一列的重复值

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [53]:
df.drop_duplicates([1])

Unnamed: 0,0,1,2,3,4,5
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [64]:
df[4] = df[4].astype("float")
df[4].interpolate() #取前后的平均数，如果是开头结尾则直接和前后数字相同

0    10.0
1    19.0
2    32.0
3    25.0
4     8.0
5     8.0
Name: 4, dtype: float64

In [65]:
df[4].interpolate(method='spline',order=3)
#可以选择多种方法插值，这里使用的是三次样条插值

0    10.000000
1    19.000000
2    32.000000
3    25.000000
4     8.000000
5   -20.143603
Name: 4, dtype: float64

In [67]:
df[3] = df[3].astype("float")
upper_q=df[3].quantile(0.75)
lower_q=df[3].quantile(0.25)#选取上下侧分位数
q_int=upper_q-lower_q
k=1.5
df[df[3]>lower_q-k*q_int][df[3]<upper_q+k*q_int]

  


Unnamed: 0,0,1,2,3,4,5
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [68]:
df[[True if item.startswith('f') else False for item in list(df[5].values)]]
#处理掉在单列中首字母不一致的数组

Unnamed: 0,0,1,2,3,4,5
0,a0,b0,1,0.1,10.0,f0
1,a1,b1,2,10.2,19.0,f1
3,a2,b2,3,8.9,25.0,f3
4,a3,b3,4,9.1,8.0,f4
5,a4,,5,12.0,,f5


##### 特征预处理

In [71]:
df=pd.DataFrame({'A':ss.norm.rvs(size=10),'B':ss.norm.rvs(size=10),'C':ss.norm.rvs(size=10),
                'D':np.random.randint(low=0,high=2,size=10)})
df

Unnamed: 0,A,B,C,D
0,-1.129367,-1.05349,0.942974,0
1,0.318305,0.53412,-2.21058,1
2,-0.096264,1.117052,-0.30976,0
3,-0.313302,0.61189,-0.102147,1
4,1.525268,-1.578666,-1.791579,1
5,-1.382642,1.140805,0.196972,0
6,-0.438494,-0.727005,-0.649082,1
7,-0.498674,0.649548,-0.779882,0
8,-0.322507,-0.151418,-2.153197,0
9,0.167339,-0.879029,-0.130023,0


In [73]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [76]:
X = df.loc[:,['A','B','C']]#特征
Y = df.loc[:,'D']#标注

In [78]:
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
#引入用于数据归约的方法，过滤思想，包裹思想，嵌入思想

###### 特征选择

In [79]:
skb = SelectKBest(k=2)
skb.fit(X,Y)

SelectKBest(k=2, score_func=<function f_classif at 0x000002256AC4F378>)

In [80]:
skb.transform(X)#可见保留AB，C的值对于最后的D结果不显著，所以被弱化去除

array([[-1.1293672 ,  0.94297356],
       [ 0.3183049 , -2.21058026],
       [-0.09626359, -0.30976025],
       [-0.31330238, -0.10214697],
       [ 1.5252678 , -1.79157946],
       [-1.38264223,  0.19697189],
       [-0.43849364, -0.64908231],
       [-0.49867361, -0.77988219],
       [-0.32250672, -2.15319743],
       [ 0.1673393 , -0.13002303]])

In [82]:
rfe = RFE(estimator=SVR(kernel='linear'),n_features_to_select=2,step=1)
#选择留下2个最重要特征向量，每次迭代删除一个特征值
rfe.fit_transform(X,Y)

array([[-1.05348959,  0.94297356],
       [ 0.53411995, -2.21058026],
       [ 1.11705247, -0.30976025],
       [ 0.61188972, -0.10214697],
       [-1.57866606, -1.79157946],
       [ 1.14080489,  0.19697189],
       [-0.72700481, -0.64908231],
       [ 0.64954832, -0.77988219],
       [-0.15141777, -2.15319743],
       [-0.87902948, -0.13002303]])

In [85]:
sfm = SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.1)
#低于0.1的特征量就会被舍弃 选择此模型，必须要有feature_importance or coef_
sfm.fit_transform(X,Y)

array([[-1.1293672 , -1.05348959,  0.94297356],
       [ 0.3183049 ,  0.53411995, -2.21058026],
       [-0.09626359,  1.11705247, -0.30976025],
       [-0.31330238,  0.61188972, -0.10214697],
       [ 1.5252678 , -1.57866606, -1.79157946],
       [-1.38264223,  1.14080489,  0.19697189],
       [-0.43849364, -0.72700481, -0.64908231],
       [-0.49867361,  0.64954832, -0.77988219],
       [-0.32250672, -0.15141777, -2.15319743],
       [ 0.1673393 , -0.87902948, -0.13002303]])

###### 特征变换

In [89]:
lst=[6,8,10,15,16,24,25,40,67]
pd.qcut(lst,q=3,labels=['low','medium','high'])#深度

[low, low, low, medium, medium, medium, high, high, high]
Categories (3, object): [low < medium < high]

In [90]:
pd.cut(lst,bins=3,labels=['low','medium','high'])#宽度

[low, low, low, low, low, low, low, medium, high]
Categories (3, object): [low < medium < high]

###### 归一化与标准化

In [91]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [93]:
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1)).T #被放在0-1之间



array([[0.  , 0.15, 0.45, 0.7 , 1.  ]])

In [98]:
StandardScaler().fit_transform(np.array([1,0,0,0,0]).reshape(-1,1)).T 
#可见突出的值



array([[ 2. , -0.5, -0.5, -0.5, -0.5]])

###### 数值化

In [101]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
lb_encoder = LabelEncoder()#标签化
lb_encoder.fit_transform(np.array(['l','l','m','h','m']).reshape(-1,1))

  y = column_or_1d(y, warn=True)


array([1, 1, 2, 0, 2], dtype=int64)

In [107]:
lb_encoder = LabelEncoder()#独热（定类）
lb_tran_f = lb_encoder.fit_transform(np.array(['red','yellow','blue','green']))
oht_encoder = OneHotEncoder().fit(lb_tran_f.reshape(-1,1))
oht_encoder.transform(lb_encoder.transform(np.array(['red','yellow','blue','green'])).reshape(-1,1)).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]])

###### 正规化

In [113]:
from sklearn.preprocessing import Normalizer
Normalizer(norm="l2").fit_transform(np.array([[1,1,3,-1,2]]))
#norm : ‘l1’, ‘l2’, or ‘max’, optional (‘l2’ by default) 欧式范数|x|1/2

array([[ 0.25,  0.25,  0.75, -0.25,  0.5 ]])

###### 特征降维

In [131]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X = np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y = np.array([1,1,1,2,2,2])
clf = LinearDiscriminantAnalysis()
clf.fit(X, Y)  
LinearDiscriminantAnalysis(n_components=1,solver='svd',tol=0.0001)#可以是pca或svd
print(clf.fit_transform(X,Y))
print(clf.predict([[0.8, 1]]))# Fisher classifier

[[-1.73205081]
 [-1.73205081]
 [-3.46410162]
 [ 1.73205081]
 [ 1.73205081]
 [ 3.46410162]]
[[-1.73205081]
 [-1.73205081]
 [-3.46410162]
 [ 1.73205081]
 [ 1.73205081]
 [ 3.46410162]]
[2]
