# day1 数据预处理

1. import the required librares
2. import data set
3. handle missing data
4. encoding categorical data
5. split data into train and test data set
6. feature scale


主要内容：
1. sklearn.preprocessing

## 1. 导入库文件

In [1]:
import numpy as np
import pandas as pd

## 2. 倒入数据

In [2]:
dataset = pd.read_csv("../data/day1.csv")

In [3]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

In [4]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

## 3. 空值处理

In [12]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy = "mean", axis = 0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [9]:
Imputer?

In [11]:
imputer.transform?

## 4. 分类变量编码

In [17]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[: ,0] = labelencoder_X.fit_transform(X[:, 0])

### 创建哑变量

In [19]:
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [20]:
X

array([[  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.40000000e+01,
          7.20000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   2.70000000e+01,
          4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   3.00000000e+01,
          5.40000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   3.80000000e+01,
          6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   4.00000000e+01,
          6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   3.50000000e+01,
          5.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00

## 5. 样本分组(将样本分成测试集合训练集)

In [21]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 0)
X_train

array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   4.00000000e+01,
          6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   3.70000000e+01,
          6.70000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   2.70000000e+01,
          4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   3.87777778e+01,
          5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.80000000e+01,
          7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   3.80000000e+01,
          6.10000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00

In [22]:
X_test

array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   3.00000000e+01,
          5.40000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   5.00000000e+01,
          8.30000000e+04]])

## 6. 变量转换

> 变量标准化，等等

In [23]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [24]:
X_train

array([[-1.        ,  1.        , -1.        ,  2.64575131, -0.77459667,
         0.26306757,  0.12381479],
       [ 1.        , -1.        ,  1.        , -0.37796447, -0.77459667,
        -0.25350148,  0.46175632],
       [-1.        ,  1.        , -1.        , -0.37796447,  1.29099445,
        -1.97539832, -1.53093341],
       [-1.        ,  1.        , -1.        , -0.37796447,  1.29099445,
         0.05261351, -1.11141978],
       [ 1.        , -1.        ,  1.        , -0.37796447, -0.77459667,
         1.64058505,  1.7202972 ],
       [-1.        ,  1.        , -1.        , -0.37796447,  1.29099445,
        -0.0813118 , -0.16751412],
       [ 1.        , -1.        ,  1.        , -0.37796447, -0.77459667,
         0.95182631,  0.98614835],
       [ 1.        , -1.        ,  1.        , -0.37796447, -0.77459667,
        -0.59788085, -0.48214934]])

In [25]:
X_test

array([[ 0.,  0.,  0.,  0.,  0., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.]])