In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectFromModel

%matplotlib inline


In [2]:
def load_the_data(path):
    """Read the CSV file located at ``path`` and return it as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

### Take a quick look at the data

In [3]:
# NOTE(review): absolute, machine-specific Windows path -- the notebook only
# runs on a machine where this exact file exists; consider a configurable,
# relative data directory.
path = r'C:\Users\kk\Documents\Python Projects\California Housing Price\handson-ml\datasets\housing\housing.csv'
df = load_the_data(path)
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# Number of columns in the raw frame.
len(df.columns)

In [None]:
# Per-column dtype and non-null count (shows which columns have missing values).
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Distinct categories found in ocean_proximity.
df.ocean_proximity.unique()

In [None]:
# Number of rows per ocean_proximity category.
df.ocean_proximity.value_counts()

In [None]:
# Histograms of the frame's columns, 50 bins each, drawn with figsize (20, 15).
df.hist(bins=50, figsize = (20,15))

In [None]:
# Count the rows whose median house value lies strictly between 99,000 and
# 110,000. Explicit comparisons replace `Series.between(..., inclusive=False)`:
# the boolean form of `inclusive` was deprecated in pandas 1.3 and is rejected
# by pandas 2.x, whereas the comparisons behave the same on every version.
in_band = (df.median_house_value > 99000) & (df.median_house_value < 110000)
int(in_band.sum())

### Create a test set

#### stratified shuffle split

In [4]:
# Train and test sets used throughout the rest of the project.
# Derive an income category to stratify on: ceil(median_income / 1.5), with
# every value of 5 or more collapsed into category 5.
df['median_income_cat'] = np.ceil(df.median_income / 1.5)
df['median_income_cat'] = df.median_income_cat.where(df.median_income_cat < 5, 5)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df.median_income_cat):
    # split() yields *positional* indices, so rows must be taken with .iloc.
    # .loc would treat them as index labels, which only happens to work while
    # df carries a default 0..n-1 index.
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]

### Discover and Visualize

In [5]:
# Explore a copy so the stratified training split itself is left untouched.
housing = strat_train_set.copy()
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'median_income_cat'],
      dtype='object')

# Geographic scatter plot: marker size (s) encodes the population, marker
# colour (c) encodes the median house value.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.1,
    s=housing['population'] / 100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    figsize=(10, 7),
)

In [None]:
# Correlation of every numeric attribute with the target, strongest first.
# Only numeric columns are passed to corr(): pandas >= 2.0 raises on text
# columns such as ocean_proximity instead of silently skipping them, and
# selecting the numeric columns explicitly works on old and new versions alike.
corr_matrix = housing.select_dtypes('number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the target and three other attributes.
att = ['median_house_value','median_income','total_rooms','housing_median_age']
# Plotted from strat_train_set, the frame that `housing` was copied from.
scatter_matrix(strat_train_set[att])

In [None]:
# Median house value on the x axis against median income on the y axis.
housing.plot(kind='scatter',x='median_house_value',y='median_income')

In [None]:
# Full correlation matrix of the numeric attributes. Numeric columns are
# selected first because pandas >= 2.0 raises when corr() meets a text column
# (ocean_proximity) instead of dropping it silently.
housing.select_dtypes('number').corr()

### Training set and labels

In [6]:
# Features: the training split without the target column.
housing = strat_train_set.drop('median_house_value',axis=1)
# Labels: an independent copy of the target column.
housing_label = strat_train_set['median_house_value'].copy()

## Class & Pipelines

In [7]:
# Numeric features: every float64 column of the training features.
# NOTE(review): only median_house_value was dropped from the training split, so
# the stratification helper median_income_cat appears to still be among these
# columns -- confirm that feeding it to the models is intended.
housing_num = housing.select_dtypes('float64')
# The ocean_proximity column on its own.
housing_cat = housing['ocean_proximity']

In [26]:
# Column positions (within the array handed to the transformer) of the
# attributes that the ratios below are built from.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    ``rooms / households`` is always appended; ``bedrooms / rooms`` is
    appended as well when ``add_bedrooms_per_room`` is true. Columns are read
    by position through the module-level ``*_ix`` constants.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio column(s) added on the right."""
        extra_columns = [X[:, rooms_ix] / X[:, household_ix]]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_[a, b, ...] is indexing with a tuple, so unpacking the list
        # into one reproduces the same column-wise concatenation.
        return np.c_[(X, *extra_columns)]
        
            
class RemoveNaN(BaseEstimator, TransformerMixin):
    """Fill missing values in the float64 columns of a DataFrame.

    Parameters
    ----------
    filling_strategy : str, default 'median'
        'median' fills each float64 column with that column's median; any
        other value fills with the column's mean.

    The fill statistics are computed from the frame passed to ``transform``.
    ``BaseEstimator`` is mixed in, like the other transformers in this
    notebook, so the class supports ``get_params`` / ``set_params``.
    """
    def __init__(self,filling_strategy = 'median'):
        self.filling_strategy = filling_strategy

    def fit(self,X,y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return a new frame in which NaNs of float64 columns are filled.

        ``X`` itself is not modified; columns of other dtypes are returned
        unchanged.
        """
        float_block = X.select_dtypes('float64')
        if self.filling_strategy == 'median':
            fill_values = float_block.median()
        else:
            fill_values = float_block.mean()
        # fillna returns a new object. The previous code discarded that
        # result, so nothing was ever filled. Passing a Series keyed by column
        # name fills only the float64 columns.
        return X.fillna(fill_values)
        

class ChangeObjectToInt:
    """Replace every object-dtype column of a DataFrame by dummy columns.

    Each object column ``c`` is expanded with ``pd.get_dummies`` into
    ``c_<value>`` indicator columns, which are appended on the right, and the
    original column is dropped.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the frame with its object columns one-hot encoded."""
        encoded = X
        for name in X.select_dtypes('object').columns:
            indicators = pd.get_dummies(encoded[name], prefix=name)
            encoded = encoded.join(indicators).drop(name, axis=1)
        return encoded
    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns of a DataFrame and hand them on as an array.

    Parameters
    ----------
    attributes_name : list
        Column labels to select.
    """

    def __init__(self, attributes_name):
        self.attributes_name = attributes_name

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the ``.values`` of the selected columns."""
        selected = X[self.attributes_name]
        return selected.values

#### Transformation Pipelines


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Feature selector: keep the columns whose importance in a small random forest
# is at least 0.1.
# NOTE(review): the target passed to fit_transform below is the continuous
# median_house_value, yet features are ranked with a *classifier*; a
# RandomForestRegressor looks like the intended model. It is left unchanged
# because the set of selected columns (and the max_features grids used by later
# cells) depends on it -- confirm before switching.
clf = RandomForestClassifier(n_estimators=2, random_state=0, n_jobs=-1)
sfm = SelectFromModel(clf, threshold=0.1)

num_attr = list(housing_num)
cat_attr = ['ocean_proximity']

# Numeric branch: select columns -> fill NaNs with the median -> standardize.
# SimpleImputer is the supported replacement for the deprecated (and later
# removed) sklearn.preprocessing.Imputer.
num_pipeline = Pipeline([('selector',DataFrameSelector(num_attr)),
                         ('imputer',SimpleImputer(strategy='median')),
                         #('attribs_adder',CombinedAttributesAdder()),
                         ('std_scaler',StandardScaler())
                        ])

# Categorical branch: select ocean_proximity -> one-hot encode.
cat_pipeline = Pipeline([('selector',DataFrameSelector(cat_attr)),
                         ('onehot',OneHotEncoder())
                        ])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[('num_pipeline',num_pipeline),('cat_pipeline',cat_pipeline)])

# Preparation followed by feature selection.
fpipe = Pipeline([
    ('full', full_pipeline),
    ('selector', sfm)
])

housing_prepared = fpipe.fit_transform(housing, housing_label)

housing_prepared.shape



(16512, 8)

### Select and Train Model

In [11]:
# One untuned instance of every candidate regressor.
lin_reg = LinearRegression()
decision_tree = DecisionTreeRegressor()
random_forest = RandomForestRegressor()
svm = SVR()
model_list = [lin_reg, decision_tree, random_forest, svm]

# Exhaustive search spaces, listed in the same order as model_list.
param_grid = [
    {'normalize': [True, False]},
    {'min_samples_split': [4, 12], 'max_features': [2, 4, 6, 8]},
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']},
]

# Sampling spaces for the randomized search, in the same order as model_list.
param = [
    {'normalize': [True, False]},
    {'min_samples_split': randint(4, 12), 'max_features': randint(2, 8)},
    {'n_estimators': randint(10, 30), 'max_features': randint(2, 8)},
    {'C': randint(500, 2000), 'kernel': ['linear', 'rbf'], 'gamma': [0.0001, 0.0005, 0.001]},
]

rmse_results = []
for c, (model, grid_space, random_space) in enumerate(zip(model_list, param_grid, param)):
    # RMSE on the very data the model was fitted on.
    model.fit(housing_prepared, housing_label)
    housing_prediction = model.predict(housing_prepared)
    rmse_training = np.sqrt(mean_squared_error(housing_label, housing_prediction))

    # Cross-validated RMSE, one value per fold (scores are negated MSE).
    scores = cross_val_score(model, housing_prepared, housing_label,
                             scoring='neg_mean_squared_error')
    rmse_validation = np.sqrt(-scores)

    # Grid search: keep the lowest mean RMSE over all candidates.
    grid_search = GridSearchCV(model, grid_space, scoring='neg_mean_squared_error')
    grid_search.fit(housing_prepared, housing_label)
    grid_results = grid_search.cv_results_
    rmse_grid = min(np.sqrt(-grid_results['mean_test_score']))

    # Randomized search: same summary over the sampled candidates.
    random_search = RandomizedSearchCV(model, random_space, scoring='neg_mean_squared_error')
    random_search.fit(housing_prepared, housing_label)
    random_results = random_search.cv_results_
    rmse_random = min(np.sqrt(-random_results['mean_test_score']))

    rmse_results.append({
        'model_name': type(model).__name__,
        'val_rmse:': np.mean(rmse_validation),
        'train_rmse': rmse_training,
        'rmse_grid': rmse_grid,
        'rmse_randomized': rmse_random,
    })
pd.DataFrame(rmse_results)
    

    



Unnamed: 0,model_name,val_rmse:,train_rmse,rmse_grid,rmse_randomized
0,LinearRegression,70317.697529,69958.399403,70317.812617,70317.812617
1,DecisionTreeRegressor,71901.260736,0.0,65753.272205,67227.88348
2,RandomForestRegressor,53549.689641,22229.40015,51035.625839,51495.295941
3,SVR,118699.611696,118586.110345,71432.57937,71319.844179


In [12]:
# Collect the per-model RMSE records into a frame ...
rmse = pd.DataFrame(rmse_results)
# ... and write it to rmse_train.csv, a path relative to the working directory.
rmse.to_csv('rmse_train.csv')

## Test 

In [56]:
# Everything in one pipeline: data preparation, feature selection and model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# NOTE(review): features are ranked with a classifier although the target is
# the continuous median_house_value; a regressor looks like the intended model.
# Left unchanged because max_features=6 below relies on the current selection.
clf = RandomForestClassifier(n_estimators=2, random_state=0, n_jobs=-1)
sfm = SelectFromModel(clf, threshold=0.1)
# Seeded so the reported test RMSE is reproducible between runs.
random_forest = RandomForestRegressor(max_features = 6, n_estimators = 30, random_state=42)

num_attr = list(housing_num)
cat_attr = ['ocean_proximity']

# Numeric branch: select columns -> fill NaNs with the median -> standardize.
# SimpleImputer is the supported replacement for the deprecated (and later
# removed) sklearn.preprocessing.Imputer.
num_pipeline = Pipeline([('selector',DataFrameSelector(num_attr)),
                         ('imputer',SimpleImputer(strategy='median')),
                         #('attribs_adder',CombinedAttributesAdder()),
                         ('std_scaler',StandardScaler())
                        ])

# Categorical branch: select ocean_proximity -> one-hot encode.
cat_pipeline = Pipeline([('selector',DataFrameSelector(cat_attr)),
                         ('onehot',OneHotEncoder())
                        ])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[('num_pipeline',num_pipeline),('cat_pipeline',cat_pipeline)])

fpipe = Pipeline([
    ('full', full_pipeline),
    ('selector', sfm),
    ('rf',random_forest)
])

# NOTE(review): from here on `housing_prepared` is the *fitted pipeline*
# returned by fit(), no longer a feature matrix. The following cells call
# .predict on this name, so it is kept; a distinct name would be clearer.
housing_prepared = fpipe.fit(housing, housing_label)




In [55]:
# Build the held-out features and labels here, so this cell does not depend on
# a cell further down the notebook having been executed first.
housing_test = strat_test_set.drop('median_house_value', axis=1)
housing_label_test = strat_test_set['median_house_value'].copy()

# `housing_prepared` holds the fitted preparation + model pipeline at this
# point, so predict() takes the raw test features directly.
housing_prediction = housing_prepared.predict(housing_test)
rmse_test = np.sqrt(mean_squared_error(housing_label_test,housing_prediction))
rmse_test

48508.85670670461

In [57]:
# Display the fitted pipeline: at this point `housing_prepared` refers to the
# object returned by fpipe.fit(...), not to a feature matrix.
housing_prepared

Pipeline(memory=None,
         steps=[('full',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('num_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  DataFrameSelector(attributes_name=['longitude',
                                                                                                     'latitude',
                                                                                                     'housing_median_age',
                                                                                                     'total_rooms',
                                                                                                     'total_bedrooms',
                                                                                                     'populatio

In [32]:
# Pipeline for data preparation plus a separate model.
housing_test = strat_test_set.drop('median_house_value',axis=1)
housing_label_test = strat_test_set['median_house_value'].copy()

# Depending on which cell defined it last, `fpipe` may end with the 'rf'
# estimator (which has no transform) and `housing_prepared` may be a fitted
# Pipeline instead of a matrix. Keep only the transformer steps, so this cell
# always works on a preparation-only pipeline, and rebuild the training matrix
# from it.
prep_pipeline = Pipeline([(name, step) for name, step in fpipe.steps
                          if hasattr(step, 'transform')])
housing_prepared = prep_pipeline.fit_transform(housing, housing_label)
housing_prepared_test = prep_pipeline.transform(housing_test)

# Seeded so the reported test RMSE is reproducible between runs.
random_forest = RandomForestRegressor(max_features = 6, n_estimators = 30, random_state=42)
random_forest.fit(housing_prepared,housing_label)
housing_prediction = random_forest.predict(housing_prepared_test)
rmse_test = np.sqrt(mean_squared_error(housing_label_test,housing_prediction))
rmse_test


48377.299927407425

In [20]:
# Print the RMSE of every grid-search candidate next to its parameters.
# The loop variable is deliberately not called `param`: that name is also the
# module-level list of randomized-search spaces, and rebinding it here would
# break a re-run of the model-comparison cell.
for score, candidate in zip(grid_results['mean_test_score'],grid_results['params']):
    print(np.sqrt(-score),candidate)

63913.79671265874 {'max_features': 2, 'n_estimators': 3}
56273.68590309125 {'max_features': 2, 'n_estimators': 10}
53904.3614641382 {'max_features': 2, 'n_estimators': 30}
60732.92013240154 {'max_features': 4, 'n_estimators': 3}
53003.4300213179 {'max_features': 4, 'n_estimators': 10}
51240.26690111831 {'max_features': 4, 'n_estimators': 30}
60942.79054925182 {'max_features': 6, 'n_estimators': 3}
52673.35189717745 {'max_features': 6, 'n_estimators': 10}
51095.83680836735 {'max_features': 6, 'n_estimators': 30}
60451.73527219596 {'max_features': 8, 'n_estimators': 3}
53621.732746493304 {'max_features': 8, 'n_estimators': 10}
51817.15315956504 {'max_features': 8, 'n_estimators': 30}


## Exercises

### SVM


In [None]:
# Search space for the SVR: a linear kernel over C, and an RBF kernel over
# every combination of C and gamma.
c_values = [1, 10, 100, 1000]
params = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
grid_svm = GridSearchCV(svm, params, cv=5, scoring='neg_mean_squared_error')
grid_svm.fit(housing_prepared, housing_label)
grid_results = grid_svm.cv_results_
# Scores are negated MSE: flip the sign, take the root, keep the lowest RMSE.
svr_candidate_rmse = np.sqrt(-grid_results['mean_test_score'])
rmse_svr = min(svr_candidate_rmse)

In [None]:
# Print the RMSE of every SVR candidate next to its parameters.
# The loop variable is deliberately not called `param`, so the module-level
# `param` list of randomized-search spaces is not overwritten.
for candidate_rmse, candidate in zip(np.sqrt(-grid_results['mean_test_score']),grid_results['params']):
    print(candidate_rmse,candidate)

In [None]:
# Show the model-comparison table again.
pd.DataFrame(rmse_results)

## GridSearchCV for transformer -> CombinedAttributesAdder

In [None]:
from sklearn.impute import SimpleImputer

# Grid-search the transformer's own switch (add_bedrooms_per_room) together
# with the model.
# CombinedAttributesAdder reads columns by their position in the raw numeric
# block (rooms_ix, bedrooms_ix, ...), so it has to run on the imputed numeric
# features *before* scaling and feature selection. Feeding it the prepared
# matrix would divide unrelated, standardized columns by one another.
num_pipeline_with_adder = Pipeline([
    ('selector', DataFrameSelector(num_attr)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# handle_unknown='ignore': a rare category may be missing from a CV training
# fold and must not make the held-out fold fail.
cat_pipeline_cv = Pipeline([
    ('selector', DataFrameSelector(cat_attr)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preparation = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline_with_adder),
    ('cat_pipeline', cat_pipeline_cv),
])
pipeline = Pipeline([('preparation', preparation), ('random_forest', random_forest)])

# Parameter grid, addressed through the nested step names.
adder_param_grid = {
    'preparation__num_pipeline__attribs_adder__add_bedrooms_per_room': [True, False]
}

# Run the search on the raw training features; the pipeline prepares them
# inside every fold.
grid_search_extra_param = GridSearchCV(pipeline, adder_param_grid, cv=5,
                                       scoring='neg_mean_squared_error')
grid_search_extra_param.fit(housing, housing_label)
grid_results = grid_search_extra_param.cv_results_
rmse_grid_extra_param = min(np.sqrt(-grid_results['mean_test_score']))
rmse_grid_extra_param



