In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import CategoricalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

# read csv
housing = pd.read_csv("../datasets/housing.csv")

# stratify the sample
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# create train and test sets
for train_index, test_index in split.split(housing, housing["income_cat"]):
    train_set = housing.iloc[train_index]
    test_set = housing.iloc[test_index]
for set_ in [train_set, test_set]:
    set_.drop("income_cat", axis=1, inplace=True)
    
# separate feature and label for training
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

# class: add attributes
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, roots_per_household, population_per_household]
        
# class: convert dataframe to numpy array
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
# class: convert category to one binary attribute
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoding):
        self.encoding = encoding
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        df_X = pd.DataFrame({"A":X.ravel()})
        X_encoded = pd.factorize(df_X["A"])[0]
        encoder = OneHotEncoder()
        if self.encoding == "onehot-dense":
            return encoder.fit_transform(X_encoded.reshape(-1,1)).toarray()
        else:
            return encoder.fit_transform(X_encoded.reshape(-1,1))

# pipeline for numerical attributes
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", Imputer(strategy="median")),
    ("attribs_added", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler())
])

# pipeline for category attributes
cat_attribs = ["ocean_proximity"]
cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
    ("cat_encoder", CategoricalEncoder(encoding="onehot-dense")),
])

# combine pipelines
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

# final prepare
housing_prepared = full_pipeline.fit_transform(housing)

# linear regression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

# decision tree
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

# random forest
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

# cross-validation
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, 
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, 
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)

# display scores
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
print(lin_rmse)
display_scores(tree_rmse_scores)
print(tree_rmse)
display_scores(lin_rmse_scores)
print(forest_rmse)
display_scores(forest_rmse_scores)

# save the parameters
#joblib.dump(lin_rmse, "ex_2_lin.pkl")
#lin_loaded = joblib.load("ex_2_lin.pkl")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


68632.1192054
Scores: [ 69281.35862515  66516.81299445  69172.46307431  70184.37613155
  70513.06112787  74426.90846472  70803.32072523  70667.93156008
  75953.9879416   70360.64942284]
Mean: 70788.0870068
Standard deviation: 2523.67361613
0.0
Scores: [ 66813.56315572  66960.118071    70351.64600304  74739.57052552
  68018.93333434  71193.84183426  64969.63056405  68278.06137217
  71553.60360967  67665.09747822]
Mean: 69054.4065948
Standard deviation: 2729.92680166
21971.1725475
Scores: [ 52197.72016739  49001.29976964  52904.20147068  54286.35970913
  52512.07048442  57195.16123476  51676.67997974  50474.94107832
  54672.09217992  52375.4921816 ]
Mean: 52729.6018256
Standard deviation: 2161.39901682


['ex_2_lin.pkl']