In [149]:
# Maintaning all the libraries 

import os 
import datetime
import tarfile
from six.moves import urllib 
import time
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score , cross_val_predict, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.externals import joblib

In [2]:
os.path.exists(os.path.join('.', 'DataUniverse')) 

True

In [3]:
download_root = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz'

In [4]:
def fetch_data_url (URL , name_dataset = 'housing_data'):
    end_location = os.path.join('.', 'DataUniverse', name_dataset)
    if not os.path.exists(end_location): 
        os.makedirs(end_location)
    tgz_location = os.path.join(end_location, name_dataset + '.tgz')
    urllib.request.urlretrieve(URL, tgz_location)
    data_file = tarfile.open(tgz_location)
    data_file.extractall(path = end_location) 
    data_file.close() 
    

In [5]:
housing_df = pd.read_csv('/Users/hitulshah/DataUniverse/housing_data/housing.csv')

In [8]:
def stratified_split(df , target_column, TEST_SIZE = 0.3):
    split = StratifiedShuffleSplit(n_splits = 1, test_size= TEST_SIZE)
    data = df.copy()
    data['strat_var'] = np.ceil(data[target_column] / 100000)
    for train_index, test_index in split.split(data,data['strat_var']):
        data_train  = data.iloc[train_index].drop(columns= ['strat_var']) 
        data_test = data.iloc[test_index].drop(columns = ['strat_var']) 
    del data
    print('Training Data Size{}'.format(data_train.shape))
    print('Testing data Size{}'.format(data_test.shape))
    return data_train.drop(columns = target_column), data_test.drop(columns = target_column), data_train[target_column], data_test[target_column]

In [9]:
target = 'median_house_value'
housing_feat_train, housing_feat_test, housing_train_target, housing_test_taget = stratified_split(housing_df, target_column= target) 

Training Data Size(14448, 10)
Testing data Size(6192, 10)


In [69]:
class ColumnSelector(BaseEstimator,TransformerMixin): 

    '''
    This substract class is designed to return data with inputed dcolumns
    '''
    def __init__(self, features_ = None, numeric = True):
        self.features_ = features_
        
    def fit(self, x, Y = None): 
        return self
    
    def transform(self, x, Y = None): 
        return x[self.features_]
        

In [102]:
full_pipeline= FeatureUnion([
                            ('num_pipeline',num_pipe),
                            ('cat_pipeline', cat_pipe)
                            ])

In [112]:
def Preprocessing(numeric_variables, cat_variables):
    
    num_pipe = Pipeline([
                        ('selector', ColumnSelector(num_var)), 
                        ('imputor', SimpleImputer(strategy= 'median')),
                        ('scaling', StandardScaler())
                ]);
    
    cat_pipe = Pipeline([
                    ('selector', ColumnSelector(cat_var)), 
                    ('imputor_cat', SimpleImputer(strategy= 'most_frequent')),
                    ('encoder', OrdinalEncoder()), 
                    ('onehot', OneHotEncoder(categories= 'auto',sparse=False))
                    ]);
    
    full_pipeline= FeatureUnion([
                            ('num_pipeline',num_pipe),
                            ('cat_pipeline', cat_pipe)
                            ]);


    return full_pipeline

In [113]:
num_var = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms'
           ,'population','households','median_income']
cat_var = ['ocean_proximity']

In [118]:
full_pipeline = Preprocessing(num_var , cat_var)
housing_feat_transf_train = full_pipeline.fit_transform(housing_feat_train)

In [150]:
rf_reg = RandomForestRegressor()

In [153]:
paras = {'n_estimators': [10,20,30], 'max_features': [5,7,10] , 'bootstrap' : [True, False]}

In [161]:
grid = GridSearchCV(rf_reg,param_grid= paras , scoring = 'neg_mean_squared_error',cv= 10)

In [163]:
grid.fit(housing_feat_transf_train, housing_train_target)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 20, 30], 'max_features': [5, 7, 10], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [165]:
grid.best_params_

{'bootstrap': False, 'max_features': 7, 'n_estimators': 30}

In [166]:
cvres = grid.cv_results_

In [170]:
sorted(zip(cvres['mean_test_score'], cvres['params']))

[(-2751776193.9308467,
  {'bootstrap': False, 'max_features': 10, 'n_estimators': 10}),
 (-2695567274.639811,
  {'bootstrap': False, 'max_features': 10, 'n_estimators': 20}),
 (-2691886490.8124814,
  {'bootstrap': True, 'max_features': 10, 'n_estimators': 10}),
 (-2667979508.39847,
  {'bootstrap': True, 'max_features': 5, 'n_estimators': 10}),
 (-2659645454.57189,
  {'bootstrap': False, 'max_features': 5, 'n_estimators': 10}),
 (-2654673067.014958,
  {'bootstrap': True, 'max_features': 7, 'n_estimators': 10}),
 (-2625792841.711316,
  {'bootstrap': False, 'max_features': 7, 'n_estimators': 10}),
 (-2606775091.927631,
  {'bootstrap': False, 'max_features': 10, 'n_estimators': 30}),
 (-2530155879.8460236,
  {'bootstrap': True, 'max_features': 10, 'n_estimators': 20}),
 (-2514053100.091453,
  {'bootstrap': True, 'max_features': 5, 'n_estimators': 20}),
 (-2506343467.2295127,
  {'bootstrap': True, 'max_features': 7, 'n_estimators': 20}),
 (-2486614947.961313,
  {'bootstrap': False, 'max_fea

In [171]:
onehot

NameError: name 'onehot' is not defined