In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
house = pd.read_csv("data/housing/housing.csv")

In [None]:
house.info()

In [None]:
house.ocean_proximity.value_counts()

In [None]:
house.describe()

In [None]:
house.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# 创建测试集
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices=shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[test_indices], data.iloc[train_indices]

In [None]:
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set=ids.apply(lambda _id: test_set_check(_id, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

In [None]:
house_with_idx = house.reset_index()

In [None]:
test_set, train_set = split_train_test_by_id(house_with_idx, 0.2, "index")
train_set

In [3]:
house["income_cut"] = np.ceil(house["median_income"]/1.5)
house["income_cut"].where(house.income_cut<5, 5.0, inplace=True)

In [5]:
# 分层抽样
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)

In [6]:
for train_index, test_index in split.split(house, house.income_cut):
    strat_train_set = house.loc[train_index]
    strat_test_set = house.loc[test_index]

In [7]:
strat_test_set.income_cut.value_counts()/len(strat_test_set)

3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cut, dtype: float64

In [8]:
strat_train_set.income_cut.value_counts()/len(strat_train_set)

3.0    0.350594
2.0    0.318859
4.0    0.176296
5.0    0.114402
1.0    0.039850
Name: income_cut, dtype: float64

In [9]:
strat_test_set.drop("income_cut", axis=1, inplace=True)
strat_train_set.drop("income_cut", axis=1, inplace=True)

In [None]:
strat_train_set.to_csv("./temp_data/housing.csv", index=False)

In [None]:
strat_train_set

In [10]:
housing = pd.read_csv("./temp_data/housing.csv")

In [None]:
housing

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.49)
plt.show()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.5,
            s=housing["population"]/100, label="population",
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.show()

In [None]:
housing.corr()

In [None]:
scatter_matrix(housing[["median_house_value", "median_income", "total_rooms", "housing_median_age"]], figsize=[21, 15])

In [None]:
housing.plot(kind="scatter", y="median_house_value", x="median_income")
plt.show()

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
housing

In [None]:
housing.corr()[["median_house_value"]].sort_values("median_house_value", ascending=False)

In [11]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [None]:
imputer = Imputer(strategy="median")

In [None]:
imputer.fit(housing_num)

In [None]:
X = imputer.transform(housing_num)

In [None]:
housting_tr = pd.DataFrame(X, columns=housing_num.columns)

In [None]:
housting_tr

In [None]:
encoder = LabelEncoder()

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat_encoded = encoder.fit_transform(housing_cat)

In [None]:
housing_cat_encoded

In [None]:
encoder.classes_

In [None]:
encoder=OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))

In [None]:
housing_cat_1hot.toarray()

In [None]:
encoder=LabelBinarizer()
housing_bin = encoder.fit_transform(housing_cat)
housing_bin

In [12]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self;
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix]/X[:, household_ix]
        population_per_household = X[:, population_ix]/X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)

In [None]:
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing_extra_attribs

##  转换流水线

In [13]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

array([[ 0.76864785, -0.84900995, -0.68921602, ...,  0.08686294,
        -0.05081885, -0.59589883],
       [ 0.85858718, -0.94286039, -1.24472593, ...,  0.97763978,
        -0.03281407, -1.31536433],
       [-0.97017917,  1.37993786,  0.58052093, ..., -0.50741255,
        -0.0213332 ,  0.8634663 ],
       ...,
       [-1.47483874,  1.06553891, -1.00665025, ..., -0.21367022,
        -0.16283672,  0.33588826],
       [-1.1450612 ,  0.82152778, -1.08600881, ..., -0.16492238,
         0.05365635, -0.24709171],
       [ 0.75365796, -0.71761934,  0.73923805, ..., -0.05519075,
        -0.02969452, -0.49016635]])

In [14]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [15]:
from sklearn.base import TransformerMixin
class MyLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        return self.encoder.transform(x)

In [16]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
    ("label_binarizer", MyLabelBinarizer())
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [28]:
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [18]:
housing_prepared = full_pipeline.fit_transform(housing)

In [19]:
housing_prepared

array([[ 0.76864785, -0.84900995, -0.68921602, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85858718, -0.94286039, -1.24472593, ...,  0.        ,
         0.        ,  0.        ],
       [-0.97017917,  1.37993786,  0.58052093, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.47483874,  1.06553891, -1.00665025, ...,  0.        ,
         1.        ,  0.        ],
       [-1.1450612 ,  0.82152778, -1.08600881, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.75365796, -0.71761934,  0.73923805, ...,  0.        ,
         0.        ,  0.        ]])

# 选择和训练模型

In [25]:
housing = housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.03,33.82,20.0,2662.0,464.0,1275.0,472.0,6.0162,318500.0,<1H OCEAN
1,-117.85,33.62,13.0,5192.0,658.0,1865.0,662.0,15.0001,500001.0,<1H OCEAN
2,-121.51,38.57,36.0,613.0,166.0,425.0,147.0,2.2031,93800.0,INLAND
3,-123.63,41.11,19.0,1797.0,384.0,1033.0,327.0,1.4911,59200.0,<1H OCEAN
4,-117.60,33.87,18.0,6450.0,1165.0,3716.0,1113.0,4.2721,150300.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
16507,-117.91,33.76,22.0,7531.0,1569.0,5254.0,1523.0,3.8506,167400.0,<1H OCEAN
16508,-121.97,37.31,25.0,5775.0,1225.0,3580.0,1138.0,3.9187,314900.0,<1H OCEAN
16509,-122.52,37.90,16.0,1704.0,402.0,689.0,348.0,4.4239,267100.0,NEAR BAY
16510,-121.86,37.38,15.0,2052.0,405.0,1380.0,409.0,5.8686,181100.0,<1H OCEAN


In [27]:
housing_prepared

array([[ 0.76864785, -0.84900995, -0.68921602, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85858718, -0.94286039, -1.24472593, ...,  0.        ,
         0.        ,  0.        ],
       [-0.97017917,  1.37993786,  0.58052093, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.47483874,  1.06553891, -1.00665025, ...,  0.        ,
         1.        ,  0.        ],
       [-1.1450612 ,  0.82152778, -1.08600881, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.75365796, -0.71761934,  0.73923805, ...,  0.        ,
         0.        ,  0.        ]])

In [30]:
strat_train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
11632,-118.03,33.82,20.0,2662.0,464.0,1275.0,472.0,6.0162,318500.0,<1H OCEAN
10673,-117.85,33.62,13.0,5192.0,658.0,1865.0,662.0,15.0001,500001.0,<1H OCEAN
12532,-121.51,38.57,36.0,613.0,166.0,425.0,147.0,2.2031,93800.0,INLAND
2600,-123.63,41.11,19.0,1797.0,384.0,1033.0,327.0,1.4911,59200.0,<1H OCEAN
12074,-117.60,33.87,18.0,6450.0,1165.0,3716.0,1113.0,4.2721,150300.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
11349,-117.91,33.76,22.0,7531.0,1569.0,5254.0,1523.0,3.8506,167400.0,<1H OCEAN
17977,-121.97,37.31,25.0,5775.0,1225.0,3580.0,1138.0,3.9187,314900.0,<1H OCEAN
9396,-122.52,37.90,16.0,1704.0,402.0,689.0,348.0,4.4239,267100.0,NEAR BAY
17823,-121.86,37.38,15.0,2052.0,405.0,1380.0,409.0,5.8686,181100.0,<1H OCEAN


In [31]:
lin_reg = LinearRegression()
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop("median_house_value", axis=1)

lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[10:30]
some_labels = housing_labels.iloc[10:30]
some_data_prepared = full_pipeline.transform(some_data)
lin_reg.predict(some_data_prepared)

KeyError: "['median_house_value'] not in index"

In [24]:
list(some_labels)

[36600.0,
 289500.0,
 150600.0,
 122000.0,
 440900.0,
 139200.0,
 129400.0,
 175900.0,
 241200.0,
 286200.0,
 348300.0,
 500001.0,
 166300.0,
 157500.0,
 85700.0,
 213700.0,
 198400.0,
 210900.0,
 132700.0,
 207500.0]

In [22]:
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3.089202769303165e-10