In [42]:
import pandas as pd
import numpy as np
import os

def load_housing_data(housing_path='./'):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()

import hashlib
import numpy as np

def identifier_test_number(identifier, test_ratio):
    return hashlib.sha256(str(identifier).encode("utf-8")).digest()[-1] < 255*test_ratio

def create_train_and_test(dataset, index_column, test_ratio):
    ids = dataset[index_column]
    test_indice = ids.apply(lambda x:identifier_test_number(x,test_ratio))
    return dataset.loc[~test_indice], dataset.loc[test_indice]
    
housing_with_id = housing.reset_index()
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"] # This value is much stable
train_set, test_set = create_train_and_test(housing_with_id,'id',0.2)

# Knowing the data
# housing['ocean_proximity'].value_counts()
# housing.describe()
# import matplotlib.pyplot as plt
# %matplotlib inline
# housing.hist(bins=50, figsize=(20,15))
# plt.show()

In [43]:
# Split 20% of test data with classification requrement.
import numpy as np
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) # Reduce the classes and make it a integer
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) # 5.0 Max

from sklearn.model_selection import StratifiedShuffleSplit # Stratified=layered
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) # n_splits How meny, 
for train_index, test_index in split.split(housing, housing['income_cat']):
    train_set = housing.loc[train_index]
    test_set = housing.loc[test_index]

# housing['income_cat'].value_counts()/len(housing)
# 3.0    0.350581
# 2.0    0.318847
# 4.0    0.176308
# 5.0    0.114438
# 1.0    0.039826
# test_set['income_cat'].value_counts()/len(test_set)
# 3.0    0.350533
# 2.0    0.318798
# 4.0    0.176357
# 5.0    0.114583
# 1.0    0.039729

# housing.plot(kind="scatter",x='longitude', y='latitude', alpha='0.1') # 
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
#          s=housing["population"]/100, label="population",
#          c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
#      )
# plt.legend()

# corr_matrix = housing.corr()
# corr_matrix["median_house_value"].sort_values(ascending=False)
# attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
# from pandas.tools.plotting import scatter_matrix
# scatter_matrix(housing[attributes],figsize=(12,8))

# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

# housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
# housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
# housing["population_per_household"]=housing["population"]/housing["households"]
# corr_matrix = housing.corr()
# corr_matrix["median_house_value"].sort_values(ascending=False)

In [44]:
housing = train_set.drop("median_house_value", axis=1) # X DataFrame
housing_labels = train_set["median_house_value"].copy() # y Series

# Put median value to the emplty fields


In [45]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# Only Work for numerics
housing_num = housing.drop("ocean_proximity",axis=1) 
# find the median Value
imputer.fit(housing_num) 
# Predicted new value 
X = imputer.transform(housing_num)  # np.ndarray


- Estimators: Estimate something based on a dataset. Perform `fit()`, takes a dataset as a para. Any other parameter needed to guide the estimation process is considered a hyperparameter (such as an imputer’s strategy), and it must be set as an instance variable (generally via a constructor parameter).
- Transformers: transform data with `transform()` method or `fit_transform()` which might optimized.
- Predicators: `predict()` method to predict, `score()` method to measure the quality.

In [46]:
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# housing_cat = housing["ocean_proximity"]
# housing_cat_encoded = encoder.fit_transform(housing_cat)
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder()
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

There are two common ways to get all attributes to have the same scale: 
- min-max scaling. Min-max scaling (many people call this normalization) is quite simple: values are shifted and rescaled so that they end up ranging from 0 to 1. `MinMaxScaler`, `feature_range` hyperparameter changes the range
- standardization. first it subtracts the mean value (so standardized values always have a zero mean). and then it divides by the variance so that the resulting distribution has unit variance. standardization does not bound values to a specific range. However, standardization is much less affected by outliers. `StandardScaler`. 

In [47]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        room_per_house = X[:, rooms_ix] / X[:, household_ix]
        population_per_house = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, room_per_house, population_per_house, bedrooms_per_room]
        else:
            return np.c_[X, room_per_house, population_per_house]

# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])
housing_num_str = num_pipeline.fit_transform(housing_num)


from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelBinarizer
from sklearn_features.transformers import DataFrameSelector

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                         ('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                         ('label_binarizer', CustomLabelBinarizer())])

full_pipeline = FeatureUnion(transformer_list=[('num_pipeline',num_pipeline),
                                               ('cat_pipeline',cat_pipeline)])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)

(16512, 17)


In [49]:
# Begin To Train
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_data_prepared = full_pipeline.fit_transform(some_data)

some_labels = housing_labels.iloc[:5]

print("Labels:\t\t", list(some_labels))
print("Predictions:\t", lin_reg.predict(some_data_prepared))


Labels:		 [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


ValueError: shapes (5,15) and (17,) not aligned: 15 (dim 1) != 17 (dim 0)