In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score as r2

import pandas as pd
import numpy as np

In [2]:
# Loads the California housing Dataset
housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

In [3]:
data = pd.DataFrame(data= np.c_[housing['data'], housing['target']],
                     columns= housing['feature_names'] + ['target'])

In [4]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,target
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,8.0,2007.0,WD,Normal,175000.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,,Reg,Lvl,AllPub,...,0.0,,MnPrv,,0.0,2.0,2010.0,WD,Normal,210000.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,,Reg,Lvl,AllPub,...,0.0,,GdPrv,Shed,2500.0,5.0,2010.0,WD,Normal,266500.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,4.0,2010.0,WD,Normal,142125.0


In [5]:
liste_to_numeric = ['YrSold', 'YearBuilt', 'target', 'GrLivArea', 'LotFrontage', 'LotArea','GarageArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'FullBath', 'TotRmsAbvGrd', 'GarageCars']
data[liste_to_numeric] = data[liste_to_numeric].apply(pd.to_numeric)

Data cleaning

In [6]:
# create dummies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')]

# Rectify some variable names
data = data.rename(columns={'1stFlrSF':'FirstFlrSF', '2ndFlrSF':'SecondFlrSF'})

In [7]:
# Select Features
X = data[['Age','GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea', 'Fence', 'Pool', 'YrSold', 'YearBuilt', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF', 'FullBath', 'TotRmsAbvGrd', 'GarageCars']]
Y = data[['target']]

In [8]:
X.to_csv('/home/apprenant/Documents/Projets/Projet_E2_P1/data/clean_X_Rachid.csv', index=False)

In [9]:
X.head()

Unnamed: 0,Age,GrLivArea,LotFrontage,LotArea,GarageArea,Fence,Pool,YrSold,YearBuilt,TotalBsmtSF,FirstFlrSF,SecondFlrSF,FullBath,TotRmsAbvGrd,GarageCars
0,5.0,1710.0,65.0,8450.0,548.0,False,False,2008.0,2003.0,856.0,856.0,854.0,2.0,8.0,2.0
1,31.0,1262.0,80.0,9600.0,460.0,False,False,2007.0,1976.0,1262.0,1262.0,0.0,2.0,6.0,2.0
2,7.0,1786.0,68.0,11250.0,608.0,False,False,2008.0,2001.0,920.0,920.0,866.0,2.0,6.0,2.0
4,8.0,2198.0,84.0,14260.0,836.0,False,False,2008.0,2000.0,1145.0,1145.0,1053.0,2.0,9.0,3.0
5,16.0,1362.0,85.0,14115.0,480.0,True,False,2009.0,1993.0,796.0,796.0,566.0,1.0,5.0,2.0


In [10]:
Y.head()

Unnamed: 0,target
0,208500.0
1,181500.0
2,223500.0
4,250000.0
5,143000.0


In [11]:
X.dtypes

Age             float64
GrLivArea       float64
LotFrontage     float64
LotArea         float64
GarageArea      float64
Fence              bool
Pool               bool
YrSold          float64
YearBuilt       float64
TotalBsmtSF     float64
FirstFlrSF      float64
SecondFlrSF     float64
FullBath        float64
TotRmsAbvGrd    float64
GarageCars      float64
dtype: object

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [13]:
y_train.describe()

Unnamed: 0,target
count,933.0
mean,185046.025723
std,78222.015955
min,39300.0
25%,131500.0
50%,169000.0
75%,219500.0
max,611657.0


In [14]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [15]:
numerical_features

['Age',
 'GrLivArea',
 'LotFrontage',
 'LotArea',
 'GarageArea',
 'YrSold',
 'YearBuilt',
 'TotalBsmtSF',
 'FirstFlrSF',
 'SecondFlrSF',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageCars']

In [16]:
categorical_features

['Fence', 'Pool']

In [17]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [18]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [19]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [20]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__n_estimators': [20, 30, 40, 50], 'model__bootstrap': [True,False]}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=5, 
                      scoring='neg_mean_absolute_error')

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 22468.203537270234
Best params: {'model__bootstrap': True, 'model__n_estimators': 30}


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [21]:
prices_prediction = search.predict(X_train)
accuracy_score = mean_absolute_error(y_train, prices_prediction)
print(accuracy_score)
r2_score = r2(y_train, prices_prediction)
print(r2_score)

8420.93197570561
0.9688954897777444


In [22]:
prices_prediction = search.predict(X_valid)
accuracy_score = mean_absolute_error(y_valid, prices_prediction)
print(accuracy_score)
r2_score = r2(y_valid, prices_prediction)
print(r2_score)

21217.85868661679
0.7619278720153033


In [23]:
type(search.best_estimator_.steps[1])

tuple

In [24]:
print(search.best_estimator_.steps[1][1].feature_importances_)

[3.26785015e-02 2.20837726e-01 1.82760186e-02 2.82005862e-02
 3.02098071e-02 6.00027510e-03 7.91187661e-02 8.74155737e-02
 4.69605270e-02 1.85868116e-02 3.39864175e-02 2.02483440e-02
 3.75912064e-01 7.65300424e-04 3.85145341e-04 1.92628138e-04
 2.25508100e-04]


In [None]:
import pickle
filename = '/home/apprenant/Documents/Projets/Projet_E2_P1/src/optimized_model/finalized_model_Rachid.sav'
pickle.dump(search, open(filename, 'wb'))