In [31]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import pycaret
from pycaret.regression import *

In [32]:
# DOWNLOAD DATA
df = pd.read_csv('AmesHousing.csv')
data = df.drop(['PID'], axis=1)


In [33]:
data = data.rename(columns={"MS SubClass": "MSSubClass", "MS Zoning": "MSZoning",'Lot Frontage':'LotFrontage', 'Lot Area':'LotArea',
       'Sale Condition':'SaleCondition','Yr Sold':'YrSold','YearBuilt': 'YearBuilt', 'Gr Liv Area': 'GrLivArea', 'Garage Area':'GarageArea',
       'Year Built':'YearBuilt','Pool Area':'PoolArea','Year Remod/Add':'YearRemodAdd','Garage Cars':'GarageCars','Total Bsmt SF':"TotalBsmtSF",
       "Full Bath":"FullBath","Mas Vnr Area":"MasVnrArea","1st Flr SF":"FstFlrSF","Garage Yr Blt":"GarageYrBlt","TotRms AbvGrd":"TotRmsAbvGrd",
       "Overall Qual":"OverallQual","Screen Porch":"ScreenPorch","Open Porch SF":"OpenPorchSF","Enclosed Porch":"EnclosedPorch",
       "Wood Deck SF":"WoodDeckSF","2nd Flr SF":"SndFlrSF","Bedroom AbvGr":"BedroomAbvGr","Kitchen AbvGr":"KitchenAbvGr",
})

Data cleaning

In [34]:
data =data.fillna(0)

# create dummies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0


# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')] 

Selection des features

In [35]:
columns_to_keep=["YearBuilt","YearRemodAdd","Pool","Fence","ScreenPorch","OpenPorchSF","EnclosedPorch",
"WoodDeckSF","GarageArea","LotArea","TotalBsmtSF","FstFlrSF","SndFlrSF","GrLivArea","FullBath","BedroomAbvGr",
"KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageCars","LotFrontage",'SalePrice']

In [36]:
# Select Features
data = data[columns_to_keep]

X = data.drop('SalePrice', axis=1)
Y = data[['SalePrice']]

Preprocessing

In [37]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [38]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [39]:
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer = imputer.fit(data[numerical_features])
data[numerical_features] = imputer.transform(data[numerical_features])

In [40]:
sclaer = MinMaxScaler()
sclaer = sclaer.fit(data[numerical_features])
data[numerical_features] = sclaer.transform(data[numerical_features])

In [41]:
labelencoder = LabelEncoder()
data['Fence'] = labelencoder.fit_transform(data['Fence'])
data['Pool'] = labelencoder.fit_transform(data['Pool'])

Le modele

In [None]:
reg = setup(data = data, target = 'SalePrice')

In [None]:
rfr = create_model('rf')

In [None]:
tuned_rfr = tune_model(rfr,n_iter = 50)

In [53]:
print(tuned_rfr)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0005,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=180, n_jobs=-1, oob_score=False,
                      random_state=7763, verbose=0, warm_start=False)


In [54]:
_ = tuned_rfr.fit(X_train, y_train)

In [55]:
filename = 'last_model.sav'
pickle.dump(tuned_rfr, open(filename, 'wb'))

Evaluation du modele

In [56]:
def evaluate_Regression_models(model,x_test,y_test):
  prediction = model.predict(x_test)
  print("Mean Absolute Error:",
                       mean_absolute_error(y_test,prediction))
  print("Mean Squared Error : ", 
                       mean_squared_error(y_test,prediction))
  print("Root Mean Squared Error : ",
                     np.sqrt(mean_squared_error(y_test,prediction)))
  print("R2 Score : ",r2_score(y_test,prediction))

In [57]:
my_model = pickle.load(open("last_model.sav", 'rb'))

In [58]:
predict_on_Xtrain = my_model.predict(X_train)
predict_on_Xvalid = my_model.predict(X_valid)

In [59]:
evaluate_Regression_models(my_model,X_valid,y_valid)

Mean Absolute Error: 18069.992218583837
Mean Squared Error :  764508084.6178582
Root Mean Squared Error :  27649.7393227831
R2 Score :  0.8827634500145207
