In [1]:
%matplotlib inline
#　只在Jupter notebook展示
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import hashlib
import os
# 指定字体避免画图中文显示乱码
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False # 使负号显示正常

In [2]:
housing_path = "datasets/housing"
csv_path = os.path.join(housing_path, "housing_train_stratified_sampling.csv")
housing_train_data = pd.read_csv(csv_path)
housting_data_copy = housing_train_data.copy()

In [3]:
housting_data_copy.head()

Unnamed: 0,经度,维度,房屋年龄中位数,总房间数,总卧室数,人口数,家庭数,收入中位数,房屋价值中位数,离大海距离
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,286600.0,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,340600.0,<1H OCEAN
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,196900.0,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,46300.0,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,254500.0,<1H OCEAN


In [4]:
housing = housting_data_copy.drop('房屋价值中位数',axis=1)
housing.head()

Unnamed: 0,经度,维度,房屋年龄中位数,总房间数,总卧室数,人口数,家庭数,收入中位数,离大海距离
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN


In [5]:
housing_labels = housting_data_copy["房屋价值中位数"].copy()
housing_labels.head()

0    286600.0
1    340600.0
2    196900.0
3     46300.0
4    254500.0
Name: 房屋价值中位数, dtype: float64

### 数据清洗和处理

#### 处理缺失值 

In [6]:
# 使用平均值填补缺失值
median = housting_data_copy["总卧室数"].median()
median

433.0

In [9]:
housing_num = housting_data_copy.drop('离大海距离', axis=1)

In [17]:
# SKlearn提供的一个方便处理缺失值的类
from sklearn.preprocessing import Imputer
imputer = Imputer(median)

In [19]:
imputer.fit(housing_num)

Imputer(axis=0, copy=True, missing_values=433.0, strategy='mean', verbose=0)

In [20]:
imputer.statistics_

array([-1.19575834e+02,  3.56395773e+01,  2.86531008e+01,  2.62286094e+03,
                   nan,  1.41991036e+03,  4.97161410e+02,  3.87558937e+00,
        2.06990921e+05])

In [21]:
housing_num.median().values

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1195e+03,  4.3300e+02,
        1.1640e+03,  4.0800e+02,  3.5409e+00,  1.7950e+05])

In [22]:
X = imputer.transform(housing_num)

In [23]:
X

array([[-1.21890e+02,  3.72900e+01,  3.80000e+01, ...,  3.39000e+02,
         2.70420e+00,  2.86600e+05],
       [-1.21930e+02,  3.70500e+01,  1.40000e+01, ...,  1.13000e+02,
         6.42140e+00,  3.40600e+05],
       [-1.17200e+02,  3.27700e+01,  3.10000e+01, ...,  4.62000e+02,
         2.86210e+00,  1.96900e+05],
       ...,
       [-1.16400e+02,  3.40900e+01,  9.00000e+00, ...,  7.65000e+02,
         3.27230e+00,  9.78000e+04],
       [-1.18010e+02,  3.38200e+01,  3.10000e+01, ...,  3.56000e+02,
         4.06250e+00,  2.25900e+05],
       [-1.22450e+02,  3.77700e+01,  5.20000e+01, ...,  6.39000e+02,
         3.57500e+00,  5.00001e+05]])

###  处理文本和类别属性

In [24]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [25]:
housing_cat = housing["离大海距离"]

In [26]:
# 处理文本标签，该转换器只能用来转换标签
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

array([0, 0, 4, ..., 1, 0, 3])

In [27]:
print(encoder.classes_)

['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']


#### 使用独热编码处理文本

In [28]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# fit_transform用于2D数组，housing_cat_encoded是1D数组，需要将其变形
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot # 稀疏矩阵

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [29]:
housing_cat_1hot.toarray() # 密集矩阵

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

#### 一步达到上面两步的结果(从文本分类到整数分类，再从整数分类到独热向量)

In [31]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot # 默认输出密集numpy数组，可以向LabelBinarizer()传递参数sparse_output=True得到稀疏矩阵

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

### 转换流水线（Pipeline）

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

NameError: name 'CombinedAttributesAdder' is not defined