In [163]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score as r2

import pandas as pd
import numpy as np

In [164]:
# Loads the California housing Dataset
housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

In [165]:
housing

{'data':           Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
 0        1.0        60.0       RL         65.0   8450.0   Pave  None      Reg   
 1        2.0        20.0       RL         80.0   9600.0   Pave  None      Reg   
 2        3.0        60.0       RL         68.0  11250.0   Pave  None      IR1   
 3        4.0        70.0       RL         60.0   9550.0   Pave  None      IR1   
 4        5.0        60.0       RL         84.0  14260.0   Pave  None      IR1   
 ...      ...         ...      ...          ...      ...    ...   ...      ...   
 1455  1456.0        60.0       RL         62.0   7917.0   Pave  None      Reg   
 1456  1457.0        20.0       RL         85.0  13175.0   Pave  None      Reg   
 1457  1458.0        70.0       RL         66.0   9042.0   Pave  None      Reg   
 1458  1459.0        20.0       RL         68.0   9717.0   Pave  None      Reg   
 1459  1460.0        20.0       RL         75.0   9937.0   Pave  None      Reg   
 
      

In [166]:
data = pd.DataFrame.from_records(data= np.c_[housing['data'], housing['target']],
                    columns= housing['feature_names'] + ['target'])

In [167]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,target
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,8.0,2007.0,WD,Normal,175000.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,,Reg,Lvl,AllPub,...,0.0,,MnPrv,,0.0,2.0,2010.0,WD,Normal,210000.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,,Reg,Lvl,AllPub,...,0.0,,GdPrv,Shed,2500.0,5.0,2010.0,WD,Normal,266500.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,4.0,2010.0,WD,Normal,142125.0


Data cleaning

In [168]:
# create dummies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')]

In [169]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,target,Pool,Age
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,2.0,2008.0,WD,Normal,208500.0,False,5.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,5.0,2007.0,WD,Normal,181500.0,False,31.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,False,,0.0,9.0,2008.0,WD,Normal,223500.0,False,7.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,False,,0.0,12.0,2008.0,WD,Normal,250000.0,False,8.0
5,6.0,50.0,RL,85.0,14115.0,Pave,,IR1,Lvl,AllPub,...,True,Shed,700.0,10.0,2009.0,WD,Normal,143000.0,False,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,8.0,2007.0,WD,Normal,175000.0,False,8.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,,Reg,Lvl,AllPub,...,True,,0.0,2.0,2010.0,WD,Normal,210000.0,False,32.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,,Reg,Lvl,AllPub,...,True,Shed,2500.0,5.0,2010.0,WD,Normal,266500.0,False,69.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,4.0,2010.0,WD,Normal,142125.0,False,60.0


In [170]:
data.dtypes

Id               float64
MSSubClass       float64
MSZoning          object
LotFrontage      float64
LotArea          float64
                  ...   
SaleType          object
SaleCondition     object
target           float64
Pool                bool
Age              float64
Length: 83, dtype: object

In [171]:
for column in data.columns:
    if data[column].dtype == 'bool':
        data[column] = data[column].astype('object')

data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,target,Pool,Age
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,2.0,2008.0,WD,Normal,208500.0,False,5.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,5.0,2007.0,WD,Normal,181500.0,False,31.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,False,,0.0,9.0,2008.0,WD,Normal,223500.0,False,7.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,False,,0.0,12.0,2008.0,WD,Normal,250000.0,False,8.0
5,6.0,50.0,RL,85.0,14115.0,Pave,,IR1,Lvl,AllPub,...,True,Shed,700.0,10.0,2009.0,WD,Normal,143000.0,False,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,8.0,2007.0,WD,Normal,175000.0,False,8.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,,Reg,Lvl,AllPub,...,True,,0.0,2.0,2010.0,WD,Normal,210000.0,False,32.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,,Reg,Lvl,AllPub,...,True,Shed,2500.0,5.0,2010.0,WD,Normal,266500.0,False,69.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,,Reg,Lvl,AllPub,...,False,,0.0,4.0,2010.0,WD,Normal,142125.0,False,60.0


In [172]:
# Select Features
X = data[['Age','GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea', 'Fence', 'Pool']]
Y = data[['target']]

In [173]:
X.to_csv("/home/apprenant/Documents/Projets/Projet_E2_P1/data/clean_X_Charles.csv", index=False)

In [174]:
X.head()

Unnamed: 0,Age,GrLivArea,LotFrontage,LotArea,GarageArea,Fence,Pool
0,5.0,1710.0,65.0,8450.0,548.0,False,False
1,31.0,1262.0,80.0,9600.0,460.0,False,False
2,7.0,1786.0,68.0,11250.0,608.0,False,False
4,8.0,2198.0,84.0,14260.0,836.0,False,False
5,16.0,1362.0,85.0,14115.0,480.0,True,False


In [175]:
X.dtypes

Age            float64
GrLivArea      float64
LotFrontage    float64
LotArea        float64
GarageArea     float64
Fence           object
Pool            object
dtype: object

In [176]:
Y.dtypes

target    float64
dtype: object

In [177]:
Y.head()

Unnamed: 0,target
0,208500.0
1,181500.0
2,223500.0
4,250000.0
5,143000.0


In [178]:
"""Get metrics of a series"""
Y.describe()

Unnamed: 0,target
count,1334.0
mean,184358.968516
std,78771.01565
min,39300.0
25%,132500.0
50%,166500.0
75%,216375.0
max,755000.0


In [179]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [180]:
print(X_train.shape)
print(X_valid.shape)

(933, 7)
(401, 7)


In [181]:
y_train.describe()

Unnamed: 0,target
count,933.0
mean,185046.025723
std,78222.015955
min,39300.0
25%,131500.0
50%,169000.0
75%,219500.0
max,611657.0


In [182]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [183]:
print(numerical_features)
print(categorical_features)

['Age', 'GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea']
['Fence', 'Pool']


In [184]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [185]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [186]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [187]:
from sklearn.model_selection import GridSearchCV

param_dict = {
    'model__bootstrap': [True, False],
    'model__n_estimators': [50, 100, 150, 200]
}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 24329.294216158174
Best params: {'model__bootstrap': True, 'model__n_estimators': 200}


In [188]:
prices_prediction_train = search.predict(X_train)
accuracy_score_train = mean_absolute_error(y_train, prices_prediction_train)
print(accuracy_score_train)
r2_score = r2(y_train, prices_prediction_train)
print(r2_score)

8860.010362629511
0.9688534472961154


In [189]:
prices_prediction_test = search.predict(X_valid)
accuracy_score_test = mean_absolute_error(y_valid, prices_prediction_test)
print(accuracy_score_test)
r2_score = r2(y_valid, prices_prediction_test)
print(r2_score)

22371.546280133003
0.7698662932943252


In [190]:
type(search.best_estimator_.steps[1])

tuple

In [191]:
print(search.best_estimator_.steps[1][1].feature_importances_)

[2.12282642e-01 4.35584186e-01 4.34997857e-02 6.46993179e-02
 2.40577399e-01 1.55466745e-03 1.36817586e-03 3.11104737e-04
 1.22721460e-04]


In [192]:
type(print(5))

5


NoneType

In [193]:
import pickle
filename = '/home/apprenant/Documents/Projets/Projet_E2_P1/src/initial_model/finalized_model_Charles.sav'
pickle.dump(search, open(filename, 'wb'))