#### KNN填充

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame

In [2]:
# Toy frame for the imputation demos: each column is missing exactly one value
# (A in row 2, C in row 3, B in row 4).
data = pd.DataFrame({
    'A': [1, 4, np.nan, 10, 13, 16],
    'B': [2, 5, 8, 11, np.nan, 17],
    'C': [3, 6, 9, np.nan, 15, 18],
})
data

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,,8.0,9.0
3,10.0,11.0,
4,13.0,,15.0
5,16.0,17.0,18.0


In [3]:
from sklearn.impute import KNNImputer

In [4]:
# Fill every missing cell from its 3 nearest rows (see the scikit-learn
# KNNImputer documentation for the distance metric and averaging rule).
knn = KNNImputer(n_neighbors=3)
target = knn.fit_transform(data)  # plain array: row/column labels are lost here
# Restore the column names AND the original row index; without `index=` the
# result silently gets a fresh 0..n-1 index and no longer aligns with `data`
# when `data` has been filtered, sorted or otherwise re-indexed.
target = pd.DataFrame(target, columns=data.columns, index=data.index)
target

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,5.0,8.0,9.0
3,10.0,11.0,10.0
4,13.0,12.0,15.0
5,16.0,17.0,18.0


#### 插值预测
- DataFrame.interpolate(
-        method, # 插值方法，可指定一次，二次，三次和时间插值
-        inplace # 同上
-        )
- interpolate函数用于插值，默认为线性插值
- 可以指定：time为时间插值，quadratic为二次插值，cubic为三次插值
- 同样，如果要直接修改原数据，指定inplace为True

In [5]:
# Linear interpolation down each column (axis=0): a NaN is replaced by the value
# on the straight line between its nearest non-missing neighbours above and below.
data.interpolate(method='linear', axis=0)

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,8.0,9.0
3,10.0,11.0,12.0
4,13.0,14.0,15.0
5,16.0,17.0,18.0


#### 模型填充
- 通过建立一个线性模型来预测缺失值

In [6]:
# Frame for the model-based fill: A and B are complete, C is missing in rows 3 and 5.
data = pd.DataFrame({
    'A': [1, 4, 7, 10, 13, 16],
    'B': [2, 5, 8, 11, 14, 17],
    'C': [3, 6, 9, np.nan, 15, np.nan],
})
data

Unnamed: 0,A,B,C
0,1,2,3.0
1,4,5,6.0
2,7,8,9.0
3,10,11,
4,13,14,15.0
5,16,17,


In [7]:
# Training set: the rows where the target column C is known.
# .copy() detaches it from `data` so later edits never touch the original frame.
has_target = data['C'].notna()
train = data.loc[has_target].copy()
train

Unnamed: 0,A,B,C
0,1,2,3.0
1,4,5,6.0
2,7,8,9.0
4,13,14,15.0


In [8]:
# Rows to predict: those where the target column C is missing.
# .copy() gives an independent frame that can be assigned to without warnings.
lacks_target = data['C'].isna()
test = data.loc[lacks_target].copy()
test

Unnamed: 0,A,B,C
3,10,11,
5,16,17,


In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Columns A and B are the features; column C is the target to be predicted.
feature_cols = ['A', 'B']
lr = LinearRegression()
lr.fit(train[feature_cols], train['C'])
# Predict C for the rows where it is missing.
# As the output shows, the two missing values are predicted as 12 and 18.
lr.predict(test[feature_cols])

array([12., 18.])

In [11]:
# Store the model's predictions in the rows that were missing C ...
predicted_c = lr.predict(test[['A', 'B']])
test['C'] = predicted_c
# ... then write them back: DataFrame.update aligns on index/columns and
# modifies `data` in place using the non-NaN values from `test`.
data.update(test)
data

Unnamed: 0,A,B,C
0,1,2,3.0
1,4,5,6.0
2,7,8,9.0
3,10,11,12.0
4,13,14,15.0
5,16,17,18.0


- 变量映射

In [13]:
# Single categorical column with one missing entry (the last row) for the
# one-hot encoding demo.
data = pd.DataFrame({
    '性别': ['男', '男', '男', '女', '女', np.nan],
})
data

Unnamed: 0,性别
0,男
1,男
2,男
3,女
4,女
5,


In [14]:
# One-hot encode the column. With dummy_na=False (the default) a missing value
# gets no indicator column of its own, so its row is False in every dummy column.
pd.get_dummies(data=data, columns=['性别'], dummy_na=False)

Unnamed: 0,性别_女,性别_男
0,False,True
1,False,True
2,False,True
3,True,False
4,True,False
5,False,False


In [15]:
# Same encoding, but dummy_na=True adds an extra `_nan` indicator column so
# missing values are represented explicitly instead of as an all-False row.
pd.get_dummies(data=data, columns=['性别'], dummy_na=True)

Unnamed: 0,性别_女,性别_男,性别_nan
0,False,True,False
1,False,True,False
2,False,True,False
3,True,False,False
4,True,False,False
5,False,False,True
