In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('AmesHousing.csv')
data = df.drop(['PID'], axis=1)

In [3]:
data = data.rename(columns={"MS SubClass": "MSSubClass", "MS Zoning": "MSZoning",'Lot Frontage':'LotFrontage', 'Lot Area':'LotArea',
       'Sale Condition':'SaleCondition','Yr Sold':'YrSold','YearBuilt': 'YearBuilt', 'Gr Liv Area': 'GrLivArea', 'Garage Area':'GarageArea',
       'Year Built':'YearBuilt','Pool Area':'PoolArea','Year Remod/Add':'YearRemodAdd'
})

In [4]:
liste_to_numeric = ['YrSold', 'YearBuilt', 'GrLivArea', 'LotFrontage', 'LotArea','GarageArea']
data[liste_to_numeric] = data[liste_to_numeric].apply(pd.to_numeric)


In [5]:
data = data*1

Data cleaning

In [6]:
# create dumies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')] 


In [9]:
# Select Features
X = data[['Age','GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea', 'Fence', 'Pool']]
Y = data[['SalePrice']]

In [10]:
# Save the clean version
X.to_csv('clean_X.csv', index=False)

In [11]:
X.head()

Unnamed: 0,Age,GrLivArea,LotFrontage,LotArea,GarageArea,Fence,Pool
0,50,1656,141.0,31770,528.0,False,False
1,49,896,80.0,11622,730.0,True,False
2,52,1329,81.0,14267,312.0,False,False
3,42,2110,93.0,11160,522.0,False,False
4,13,1629,74.0,13830,482.0,True,False


In [12]:
Y.head()

Unnamed: 0,SalePrice
0,215000
1,105000
2,172000
3,244000
4,189900


In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [14]:
X_valid.to_csv("x_valid.csv", index = False)
y_valid.to_csv("y_valid.csv", index = False)

In [15]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [16]:
numerical_features

['Age', 'GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea']

In [17]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [18]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [19]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [20]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__n_estimators': [50,100,150,200], 'model__bootstrap': [True,False]}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error'
                      )

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 21797.064492341506
Best params: {'model__bootstrap': True, 'model__n_estimators': 200}


In [21]:
def evaluate_Regression_models(model,x_test,y_test):
  prediction = model.predict(x_test)
  print("Mean Absolute Error:",
                       mean_absolute_error(y_test,prediction))
  print("Mean Squared Error : ", 
                       mean_squared_error(y_test,prediction))
  print("Root Mean Squared Error : ",
                     np.sqrt(mean_squared_error(y_test,prediction)))
  print("R2 Score : ",r2_score(y_test,prediction))

In [22]:
evaluate_Regression_models(search,X_valid,y_valid)

Mean Absolute Error: 23205.522788748545
Mean Squared Error :  1223747107.9232688
Root Mean Squared Error :  34982.09696292189
R2 Score :  0.8123396051994075


In [23]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(search, open(filename, 'wb'))