In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix 
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin


# Load the raw housing table; the first row of the CSV supplies the column names.
housing = pd.read_csv("housing.csv", sep=",", header=0)

# Uncomment to plot a histogram of every column:
#housing.hist(bins=50, figsize=(20,15))
#plt.show()

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(
    housing,
    random_state=2,
    test_size=0.2,
)
train_set.shape
train_set.head()

In [None]:
# Geographic scatter plot of the training rows: marker size follows
# population / 30 and marker colour follows median_house_value.
data = train_set.copy()
data.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.2,
    figsize=(10, 7),
    label="population",
    s=data["population"] / 30,
    c=data["median_house_value"],
    cmap=plt.get_cmap("jet"),
)
data.shape
        

In [None]:
# Pairwise correlation between the numeric columns, then the correlations with
# the target sorted from strongest positive to strongest negative.
#
# `ocean_proximity` is a categorical column (it is dropped from the numeric
# frame and one-hot encoded further down). pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.corr() and raises instead, so the frame is
# restricted to numeric dtypes first. This also works on older pandas releases.
corr_matrix = data.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Scatter-matrix of the target together with three candidate predictors.
features = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(data[features], figsize=(15, 10))
plt.show()


In [None]:
# Start again from an untouched copy of the training set and plot
# median income against the target.
data = train_set.copy()
data.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    alpha=0.4,
    figsize=(10, 7),
)

data.shape
         

In [None]:
# Add three ratio columns to the working copy of the training set.
data["total_rooms_per_households"] = data["total_rooms"].div(data["households"])
data["total_bedrooms_per_total_rooms"] = data["total_bedrooms"].div(data["total_rooms"])
data["population_per_households"] = data["population"].div(data["households"])
data.head(10)

In [None]:
#========================== SimpleImputer ==================
# Separate the target from the predictors, then fill missing numeric values
# with the median of each column.

df = train_set.copy()
df_label = df["median_house_value"].copy()
df = df.drop(columns="median_house_value")

# Numeric predictors only: the categorical column is handled separately below.
df_num = df.drop(columns="ocean_proximity")

# Manual alternatives for the missing `total_bedrooms` values, kept for reference:
#   1. drop the affected rows:  df_num = df_num.dropna(subset=["total_bedrooms"])
#   2. drop the whole column:   df_num = df_num.drop("total_bedrooms", axis=1)
#   3. replace with the median: median = df_num["total_bedrooms"].median()
#                               df_num["total_bedrooms"] = df_num["total_bedrooms"].fillna(median)

imputer = SimpleImputer(strategy='median', missing_values=np.nan)
X = imputer.fit(df_num).transform(df_num)

# Rebuild a DataFrame around the imputed values. No index is passed, so the
# new frame is numbered from 0 instead of keeping the index of `df_num`.
df_num_impute_tr = pd.DataFrame(X, columns=df_num.columns)
df_num_impute_tr.info()
df_num.info()
df_num_impute_tr.head()

#=========== Custom Transformer ===============

# Column positions used by CombinedAttributesAdder when slicing its input array.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append three ratio columns to a 2-D array.

    The input is indexed positionally with the module-level constants
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``,
    so the caller must supply columns in the matching order.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with rooms/household, population/household and
        bedrooms/room appended as extra columns, in that order."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        bedrooms_per_room = X[:, bedrooms_ix] / rooms
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    
# Run the ratio transformer on the imputed values and label the widened array.
custom = CombinedAttributesAdder()
data_custom_tr_tmp = custom.transform(df_num_impute_tr.values)

# Original column names followed by the three appended ratios, in the order
# the transformer emits them.
columns = list(df_num_impute_tr.columns) + [
    "rooms_per_household",
    "population_per_household",
    "bedrooms_per_room",
]
data_custom_tr = pd.DataFrame(data_custom_tr_tmp, columns=columns)
data_custom_tr.head(10)

#============ Feature Scaling ==============
from sklearn.preprocessing import StandardScaler

# Standardise every numeric column and keep the column labels.
feature_scal = StandardScaler()
data_num_scaled_tr = pd.DataFrame(
    feature_scal.fit_transform(data_custom_tr.values),
    columns=data_custom_tr.columns,
)
data_num_scaled_tr.head()

#================================================= Label Encoder ================================================

from sklearn.preprocessing import LabelEncoder

# Map each ocean_proximity category to an integer code.
encoder = LabelEncoder()
data_cat = df["ocean_proximity"]
data_cat_encoded = pd.DataFrame(
    encoder.fit_transform(data_cat),
    columns=["ocean_proximity"],
)
data_cat_encoded.head()

#====================== OneHotEncoder =====================================================

from  sklearn.preprocessing import OneHotEncoder

# Ask for a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the new name first and fall back for older releases.
try:
    encoder_1hot = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder_1hot = OneHotEncoder(sparse=False)

data_cat_1hot_tmp = encoder_1hot.fit_transform(df[["ocean_proximity"]])
data_cat_1hot = pd.DataFrame(data_cat_1hot_tmp)

# One output column per learned category, in the order of `categories_`.
# Building the labels from `categories_` avoids `get_feature_names`, which was
# removed in scikit-learn 1.2, and yields the same "prox_<category>" names.
data_cat_1hot.columns = ["prox_" + str(category) for category in encoder_1hot.categories_[0]]
data_cat_1hot.head()

# Both frames are numbered from 0, so a column-wise concat lines the rows up.
final = pd.concat([data_num_scaled_tr, data_cat_1hot], axis=1)
final.head(10)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

# Positions of the columns that CombinedAttributesAdder slices out of its input.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

# NOTE(review): this class is also defined in the earlier preprocessing cell;
# the later definition shadows the earlier one. Consider keeping a single copy.
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that widens a 2-D array with three ratio columns."""

    def fit(self, X, y=None):
        """No fitting is required; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Append rooms/household, population/household and bedrooms/room
        (in that order) to the right of ``X``."""
        per_household_rooms = X[:, rooms_ix] / X[:, household_ix]
        per_household_population = X[:, population_ix] / X[:, household_ix]
        per_room_bedrooms = X[:, bedrooms_ix] / X[:, rooms_ix]
        extra_columns = (per_household_rooms, per_household_population, per_room_bedrooms)
        return np.c_[(X,) + extra_columns]
        
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of columns and return their underlying values.

    Parameters
    ----------
    attribute_names : the column labels to select in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, x, y=None):
        """Nothing to learn; return the selector unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values
    
# Fresh copy of the training set, split into predictors and target.
df = train_set.copy()

# Fix: take the target column from the frame. The previous
# `["median_house_value"].copy` bound the `copy` method of a one-element list,
# so every later use of `df_label` as a Series failed.
df_label = df["median_house_value"].copy()
df = df.drop("median_house_value", axis=1)

# Column groups fed to the numeric and categorical pipelines.
df_num = df.drop(["ocean_proximity"], axis=1)
num_attrs = list(df_num)
cat_attrs = ["ocean_proximity"]

# Numeric branch: select columns -> median imputation -> ratio features -> scaling.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# The dense-output keyword of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the new spelling first so the cell runs on both old and new releases.
try:
    dense_one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_one_hot = OneHotEncoder(sparse=False)

# Categorical branch: select columns -> dense one-hot encoding.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('one_hot_encoder', dense_one_hot),
])

# Run both branches on the same input and stack their outputs side by side.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(df)

# Label the output columns from what the pipeline actually produced instead of
# a hand-typed list: the numeric inputs, the three ratios appended by
# CombinedAttributesAdder (in its output order), then one "prox_<category>"
# column per category learned by the fitted one-hot encoder. This stays correct
# if the numeric column set changes or a category is absent from the split.
fitted_one_hot = dict(full_pipeline.transformer_list)["cat_pipeline"].named_steps["one_hot_encoder"]
prepared_columns = (
    list(num_attrs)
    + ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
    + ["prox_" + str(category) for category in fitted_one_hot.categories_[0]]
)

housing_prepared_df = pd.DataFrame(housing_prepared, columns=prepared_columns)
housing_prepared_df.head()

In [None]:
from sklearn.linear_model import LinearRegression

# Fix: fit on the prepared feature frame together with the target. The previous
# call referenced an undefined name (`housing_pre`) and passed no target.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared_df, df_label)

# Compare predictions with the true labels on the first four training rows.
sample_data_prepared = housing_prepared_df.iloc[:4]
print("predition:\t", lin_reg.predict(sample_data_prepared))

sample_labels = df_label.iloc[:4]
print("Labels:\t\t", list(sample_labels))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the rows it was fitted on (training error).
housing_prediction = lin_reg.predict(housing_prepared_df)
lin_mse = mean_squared_error(df_label, housing_prediction)
lin_rmse = np.sqrt(lin_mse)
# Fix: display the value just computed; `lin_emse` was an undefined name.
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree and score it on the same rows it was fitted on, so the
# result is a training error rather than an estimate on unseen data.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared_df, df_label)
housing_predictions = tree_reg.predict(housing_prepared_df)
# Fix: pass the predictions computed above; `housing_predeictons` was an
# undefined name.
tree_mse = mean_squared_error(df_label, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse