In [15]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('AmesHousing.csv')
data = df.drop(['PID'], axis=1)

In [17]:
data = data.rename(columns={"MS SubClass": "MSSubClass", "MS Zoning": "MSZoning",'Lot Frontage':'LotFrontage', 'Lot Area':'LotArea',
       'Sale Condition':'SaleCondition','Yr Sold':'YrSold','YearBuilt': 'YearBuilt', 'Gr Liv Area': 'GrLivArea', 'Garage Area':'GarageArea',
       'Year Built':'YearBuilt','Pool Area':'PoolArea','Year Remod/Add':'YearRemodAdd','Garage Cars':'GarageCars','Total Bsmt SF':"TotalBsmtSF",
       "Full Bath":"FullBath","Mas Vnr Area":"MasVnrArea","1st Flr SF":"FstFlrSF","Garage Yr Blt":"GarageYrBlt","TotRms AbvGrd":"TotRmsAbvGrd",
       "Overall Qual":"OverallQual","Screen Porch":"ScreenPorch","Open Porch SF":"OpenPorchSF","Enclosed Porch":"EnclosedPorch",
       "Wood Deck SF":"WoodDeckSF","2nd Flr SF":"SndFlrSF","Bedroom AbvGr":"BedroomAbvGr","Kitchen AbvGr":"KitchenAbvGr",
})


Data cleaning

In [18]:
# create dumies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')] 


In [19]:
columns_to_keep=["YearBuilt","YearRemodAdd","Pool","Fence","ScreenPorch","OpenPorchSF","EnclosedPorch",
"WoodDeckSF","GarageArea","LotArea","TotalBsmtSF","FstFlrSF","SndFlrSF","GrLivArea","FullBath","BedroomAbvGr",
"KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageCars","LotFrontage"]

In [22]:
# Select Features
X = data[columns_to_keep]
Y = data[['SalePrice']]

In [23]:
# Save the clean version
X.to_csv('clean_X.csv', index=False)

In [24]:
X.head()

Unnamed: 0,YearBuilt,YearRemodAdd,Pool,Fence,ScreenPorch,OpenPorchSF,EnclosedPorch,WoodDeckSF,GarageArea,LotArea,...,FstFlrSF,SndFlrSF,GrLivArea,FullBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,LotFrontage
0,1960,1960,False,False,0,62,0,210,528.0,31770,...,1656,0,1656,1,3,1,7,2,2.0,141.0
1,1961,1961,False,True,120,0,0,140,730.0,11622,...,896,0,896,1,2,1,5,0,1.0,80.0
2,1958,1958,False,False,0,36,0,393,312.0,14267,...,1329,0,1329,1,3,1,6,0,1.0,81.0
3,1968,1968,False,False,0,0,0,0,522.0,11160,...,2110,0,2110,2,3,1,8,2,2.0,93.0
4,1997,1998,False,True,0,34,0,212,482.0,13830,...,928,701,1629,2,3,1,6,1,2.0,74.0


In [25]:
Y.head()

Unnamed: 0,SalePrice
0,215000
1,105000
2,172000
3,244000
4,189900


In [26]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [27]:
X_valid.to_csv("x_valid.csv", index = False)
y_valid.to_csv("y_valid.csv", index = False)

In [28]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [30]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [31]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [32]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [33]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__n_estimators': [50,100,150,200], 'model__bootstrap': [True,False]}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error'
                      )

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 18291.531659170825
Best params: {'model__bootstrap': True, 'model__n_estimators': 100}


In [34]:
def evaluate_Regression_models(model,x_test,y_test):
  prediction = model.predict(x_test)
  print("Mean Absolute Error:",
                       mean_absolute_error(y_test,prediction))
  print("Mean Squared Error : ", 
                       mean_squared_error(y_test,prediction))
  print("Root Mean Squared Error : ",
                     np.sqrt(mean_squared_error(y_test,prediction)))
  print("R2 Score : ",r2_score(y_test,prediction))

In [35]:
evaluate_Regression_models(search,X_valid,y_valid)

Mean Absolute Error: 18561.062979204966
Mean Squared Error :  806973919.1141615
Root Mean Squared Error :  28407.286373642968
R2 Score :  0.8762513567760437


In [36]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(search, open(filename, 'wb'))