1. 导入库

In [36]:
import numpy as np
import pandas as pd

2. 导入数据集

In [37]:
df = pd.read_csv('./data/data.csv')

# Split the frame into the feature matrix (X) and the label vector (Y).
X = df.iloc[:, :-1].to_numpy()   # every column except the last one, as a NumPy array
Y = df.iloc[:, -1].to_numpy()    # the last column (the label), as a NumPy array

df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


3. 处理缺失数据

In [None]:
# NOTE: the original material used `from sklearn.preprocessing import Imputer`;
# Imputer has been deprecated, so SimpleImputer is used here instead.
from sklearn.impute import SimpleImputer

# Mean imputation: each missing entry is replaced by its column mean.
imputer = SimpleImputer(strategy="mean")

# Learn the column means from columns 1 and 2 of X, then write the filled
# values back into those same columns.
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


4. 编码分类数据

In [39]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encode the country column (column 0 of X).
# OneHotEncoder accepts string categories directly and sorts them, so the
# dummy columns come out in the same order as with a prior LabelEncoder pass.
# LabelEncoder is documented as a transformer for the target (y) only, so it
# is no longer applied to X.
# NOTE: this cell replaces X. Once it has run, column 0 of X is Age, so running
# it a second time without re-running the cells above would encode Age instead.
onehotencoder = OneHotEncoder()
X_encoded = onehotencoder.fit_transform(X[:, [0]])  # one dummy column per country

# Numeric columns first, dummy columns after. The explicit float cast turns X
# into a plain numeric array; without it np.hstack keeps the object dtype of
# the original slice.
X = np.hstack([X[:, 1:], X_encoded.toarray()]).astype(float)

# Encode the categorical outcome Y as integer class labels.
labelencoder_y = LabelEncoder()
Y = labelencoder_y.fit_transform(Y)

5. 拆分数据集

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set and train on the remaining 80%;
# random_state=0 fixes the random seed used for the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

6. 特征缩放（标准化）

In [None]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test split.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)