In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

df_housing = pd.read_csv("../datasets/housing.csv")
#df_housing.head()
#df_housing.info()
#df_housing["ocean_proximity"].value_counts()
#df_housing.describe()
#df_housing.hist(bins=50, figsize=(20,15))
#plt.show()

df_housing["income_cat"] = np.ceil(df_housing["median_income"] / 1.5)
df_housing["income_cat"].where(df_housing["income_cat"] < 5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df_housing, df_housing["income_cat"]):
    #print(train_index, test_index)
    strat_train_set = df_housing.iloc[train_index]
    strat_test_set = df_housing.iloc[test_index]
#print(df_housing["income_cat"].value_counts() / len(df_housing))
#print(strat_train_set["income_cat"].value_counts() / len(strat_train_set))
for set_ in [strat_train_set, strat_test_set]:
    set_.drop("income_cat", axis=1, inplace=True)

#train_set, test_set = split_train_test(df_housing, 0.2)
#print(train_set["income_cat"].value_counts() / len(train_set))

housing = strat_train_set.copy()
housing.plot(kind="scatter", figsize=(10,7), x="longitude", y="latitude", alpha=0.4, s=housing["population"]/100, c=housing["median_house_value"], cmap=plt.get_cmap("jet"), colorbar=False, label="population")
plt.legend()
plt.savefig("../plots/ex_2_01.pdf")

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

scatter_matrix(housing[["median_house_value","median_income","total_rooms","housing_median_age"]], figsize=(12,8))
plt.savefig("../plots/ex_2_02.pdf")

housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
plt.savefig("../plots/ex_2_03.pdf")

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

imputer = Imputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
#imputer.statistics_
#housing_num.median().values
X = imputer.transform(housing_num)

housing_cat = housing["ocean_proximity"]
housing_cat_encoded, housing_categories = housing_cat.factorize()

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

num_pipeline = Pipeline([
    ("imputer", Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
