In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score as r2

import pandas as pd
import numpy as np

In [2]:
# Loads the California housing Dataset
housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

In [3]:
data = pd.DataFrame(data= np.c_[housing['data'], housing['target']],
                     columns= housing['feature_names'] + ['target'])

In [4]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,target
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,8.0,2007.0,WD,Normal,175000.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,,Reg,Lvl,AllPub,...,0.0,,MnPrv,,0.0,2.0,2010.0,WD,Normal,210000.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,,Reg,Lvl,AllPub,...,0.0,,GdPrv,Shed,2500.0,5.0,2010.0,WD,Normal,266500.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,4.0,2010.0,WD,Normal,142125.0


In [5]:
liste_to_numeric = ['YrSold', 'YearBuilt', 'target', 'GrLivArea', 'LotFrontage', 'LotArea','GarageArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'FullBath', 'TotRmsAbvGrd', 'GarageCars']
data[liste_to_numeric] = data[liste_to_numeric].apply(pd.to_numeric)

Data cleaning

In [6]:
# create dummies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')]

# Rectify some variable names
data = data.rename(columns={'1stFlrSF':'FirstFlrSF', '2ndFlrSF':'SecondFlrSF'})

In [7]:
# Select Features
X = data[['Age','GrLivArea', 'LotArea', 'GarageArea', 'Fence', 'TotalBsmtSF', 'FirstFlrSF']]
Y = data[['target']]

In [8]:
X.to_csv('/home/marmouset/Documents/Projet_E2_P1/data/clean_X_Rachid.csv', index=False)

In [9]:
X.head()

Unnamed: 0,Age,GrLivArea,LotArea,GarageArea,Fence,TotalBsmtSF,FirstFlrSF
0,5.0,1710.0,8450.0,548.0,False,856.0,856.0
1,31.0,1262.0,9600.0,460.0,False,1262.0,1262.0
2,7.0,1786.0,11250.0,608.0,False,920.0,920.0
4,8.0,2198.0,14260.0,836.0,False,1145.0,1145.0
5,16.0,1362.0,14115.0,480.0,True,796.0,796.0


In [10]:
Y.head()

Unnamed: 0,target
0,208500.0
1,181500.0
2,223500.0
4,250000.0
5,143000.0


In [11]:
X.dtypes

Age            float64
GrLivArea      float64
LotArea        float64
GarageArea     float64
Fence             bool
TotalBsmtSF    float64
FirstFlrSF     float64
dtype: object

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.2, random_state=1121218)

In [13]:
y_train.describe()

Unnamed: 0,target
count,1067.0
mean,185582.25492
std,80653.078273
min,39300.0
25%,132000.0
50%,168000.0
75%,218750.0
max,755000.0


In [14]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [15]:
numerical_features

['Age', 'GrLivArea', 'LotArea', 'GarageArea', 'TotalBsmtSF', 'FirstFlrSF']

In [16]:
categorical_features

['Fence']

In [17]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [18]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [19]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [20]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__n_estimators': [50, 100, 150, 200], 'model__max_depth' : [5, 10], 'model__min_samples_leaf': [6, 7, 8], 'model__bootstrap': [True,False]}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=5, 
                      scoring='neg_mean_absolute_error')

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 22988.68638326726
Best params: {'model__bootstrap': True, 'model__max_depth': 10, 'model__min_samples_leaf': 6, 'model__n_estimators': 150}


In [21]:
prices_prediction = search.predict(X_train)
accuracy_score = mean_absolute_error(y_train, prices_prediction)
print(accuracy_score)
r2_score = r2(y_train, prices_prediction)
print(r2_score)

16567.935525518256
0.8791745800963088


In [22]:
prices_prediction = search.predict(X_valid)
accuracy_score = mean_absolute_error(y_valid, prices_prediction)
print(accuracy_score)
r2_score = r2(y_valid, prices_prediction)
print(r2_score)

18806.659878472237
0.8455596126958933


In [23]:
type(search.best_estimator_.steps[1])

tuple

In [24]:
print(search.best_estimator_.steps[1][1].feature_importances_)

[2.33496971e-01 3.34624424e-01 2.90424426e-02 2.10040170e-01
 1.36506156e-01 5.59499262e-02 1.74736272e-04 1.65174323e-04]


In [26]:
import pickle
filename = '/home/marmouset/Documents/Projet_E2_P1/src/optimized_model/finalized_model_Rachid.sav'
pickle.dump(search, open(filename, 'wb'))