In [1]:

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import arange
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
df_train = pd.read_csv("./dataset/train.csv")
df_train.shape

(1460, 81)

In [3]:
df_test = pd.read_csv("./dataset/test.csv")
df_test.shape

(1459, 80)

### variable numérique

In [4]:
df_train['LotArea'].isna().sum()
df_train['LotArea'].describe()
#df['LotArea'].unique()

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
max      215245.000000
Name: LotArea, dtype: float64

In [5]:
df_train.loc[df_train['TotalBsmtSF'] == 0, 'TotalBsmtSF'] = df_train['TotalBsmtSF'].quantile(0.5)
#df['TotalBsmtSF'].describe()

In [6]:
#df[['GrLivArea']].fillna((df['GrLivArea'].mean()), inplace=False)
df_train_num = df_train[['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea']] 

df_train_num.head()

Unnamed: 0,GrLivArea,GarageArea,TotalBsmtSF,1stFlrSF,LotArea
0,1710,548,856.0,856,8450
1,1262,460,1262.0,1262,9600
2,1786,608,920.0,920,11250
3,1717,642,756.0,961,9550
4,2198,836,1145.0,1145,14260


In [7]:
scaler = MinMaxScaler()
df_train_num_scaled = scaler.fit_transform(df_train_num)

scaler = StandardScaler().fit(df_train_num_scaled)
df_train_num_scaled = scaler.transform(df_train_num_scaled)

In [8]:
df_train_num = pd.DataFrame(df_train_num_scaled, columns = df_train_num.columns.tolist())

### Variable cétégorielle

In [9]:
df_train_cat = df_train[['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']].astype(str)
df_train_cat.describe()

Unnamed: 0,OverallQual,GarageCars,FullBath,TotRmsAbvGrd
count,1460,1460,1460,1460
unique,10,5,4,12
top,5,2,2,6
freq,397,824,768,402


In [10]:
df_test_cat = df_test[['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']].astype(str)
df_test_cat.describe()

Unnamed: 0,OverallQual,GarageCars,FullBath,TotRmsAbvGrd
count,1459,1459.0,1459,1459
unique,10,7.0,5,12
top,5,2.0,2,6
freq,428,770.0,762,442


Il faut concaténer les datasets train et test pour avoir toutes les "dummies"

In [11]:
df_train_test_cat = pd.concat([df_train_cat, df_test_cat])
df_train_test_cat.shape

(2919, 4)

In [12]:
df_train_test_cat = pd.get_dummies(df_train_test_cat)
df_train_test_cat.head()

Unnamed: 0,OverallQual_1,OverallQual_10,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,...,TotRmsAbvGrd_14,TotRmsAbvGrd_15,TotRmsAbvGrd_2,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
df_new = df_train_num.join(df_train_test_cat.head(1460))
df_new.shape

(1460, 46)

In [14]:
df_new.columns

Index(['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea',
       'OverallQual_1', 'OverallQual_10', 'OverallQual_2', 'OverallQual_3',
       'OverallQual_4', 'OverallQual_5', 'OverallQual_6', 'OverallQual_7',
       'OverallQual_8', 'OverallQual_9', 'GarageCars_0', 'GarageCars_0.0',
       'GarageCars_1', 'GarageCars_1.0', 'GarageCars_2', 'GarageCars_2.0',
       'GarageCars_3', 'GarageCars_3.0', 'GarageCars_4', 'GarageCars_4.0',
       'GarageCars_5.0', 'GarageCars_nan', 'FullBath_0', 'FullBath_1',
       'FullBath_2', 'FullBath_3', 'FullBath_4', 'TotRmsAbvGrd_10',
       'TotRmsAbvGrd_11', 'TotRmsAbvGrd_12', 'TotRmsAbvGrd_13',
       'TotRmsAbvGrd_14', 'TotRmsAbvGrd_15', 'TotRmsAbvGrd_2',
       'TotRmsAbvGrd_3', 'TotRmsAbvGrd_4', 'TotRmsAbvGrd_5', 'TotRmsAbvGrd_6',
       'TotRmsAbvGrd_7', 'TotRmsAbvGrd_8', 'TotRmsAbvGrd_9'],
      dtype='object')

In [15]:
# Variables explicatives
X = df_new

# Variable cible
y = df_train['SalePrice']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.22, random_state=42)

'''
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train) # X_train_scaled
X_test = scaler.transform(X_test) # X_test_scaled

scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
'''

'\nscaler = MinMaxScaler()\n\nX_train = scaler.fit_transform(X_train) # X_train_scaled\nX_test = scaler.transform(X_test) # X_test_scaled\n\nscaler = StandardScaler().fit(X_train)\n\nX_train = scaler.transform(X_train)\nX_test = scaler.transform(X_test)\n'

### Linear

In [17]:
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)

y_pred_test = lr.predict(X_test)

In [18]:
mse_train = mean_squared_error(y_train, y_pred_train)

mse_test = mean_squared_error(y_test, y_pred_test)

print("MSE train lr:", mse_train)
print("MSE test lr:", mse_test)

MSE train lr: 1220800218.633523
MSE test lr: 1270015797.5775862


In [19]:
mae_train = mean_absolute_error(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 22733.227529330063
MAE test lr: 22770.952426802043

Relative error 0.1258611646619617


### Ridge

In [20]:
ridge_reg = RidgeCV(alphas= (0.001, 0.01, 0.1, 0.3, 0.7, 1, 10, 50, 100))
ridge_reg.fit(X_train, y_train) 

In [21]:
print('alpha sélectionné par c-v :', ridge_reg.alpha_)
print('score train :', ridge_reg.score(X_train, y_train))
print('score test :', ridge_reg.score(X_test, y_test))

alpha sélectionné par c-v : 1.0
score train : 0.7975412013880919
score test : 0.820366601557315


In [22]:
ridge_pred_train = ridge_reg.predict(X_train)

ridge_pred_test = ridge_reg.predict(X_test)

In [23]:
mse_train = mean_squared_error(ridge_pred_train, y_train)

mae_test = mean_squared_error(ridge_pred_test, y_test)

print('mse train :', mse_train)
print('mse test :', mae_test)

mse train : 1225834592.9741492
mse test : 1290613605.2247238


In [24]:
mae_train = mean_absolute_error(ridge_pred_train, y_train)

mae_test = mean_absolute_error(ridge_pred_test, y_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 22749.810314200968
MAE test lr: 22914.231491844446

Relative error 0.12665310650347592


### ElasticNet

In [25]:
model_en = ElasticNetCV(cv=8, l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99), 
                        alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0), max_iter=50000)

model_en.fit(X_train, y_train)

In [26]:
pred_train = model_en.predict(X_train)
pred_test = model_en.predict(X_test)


In [27]:
mse_train = mean_squared_error(pred_train, y_train)

mae_test = mean_squared_error(pred_test, y_test)

print('mse train :', mse_train)
print('mse test :', mae_test)

mse train : 1235040547.119731
mse test : 1318278098.357321


In [28]:
mae_train = mean_absolute_error(pred_train, y_train)

mae_test = mean_absolute_error(pred_test, y_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 22756.280064692455
MAE test lr: 23145.415507324356

Relative error 0.12793092259540548


### XGBoost

In [29]:
model_xgbr = XGBRegressor(learning_rate=0.09, max_depth=4, n_estimators=500, tree_method='auto', subsample=0.64)

In [30]:
model_xgbr.fit(X_train, y_train)

In [31]:
y_pred_train = model_xgbr.predict(X_train)

y_pred_test = model_xgbr.predict(X_test)

In [32]:
mse_train = mean_squared_error(y_train, y_pred_train)

mse_test = mean_squared_error(y_test, y_pred_test)

print("MSE train lr:", mse_train)
print("MSE test lr:", mse_test)

MSE train lr: 51821241.039492026
MSE test lr: 895172439.8189751


In [33]:
mae_train = mean_absolute_error(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 5583.872109786907
MAE test lr: 20294.98180318323

Relative error 0.1121758105969876


#### save a model
#### https://www.geeksforgeeks.org/saving-a-machine-learning-model/

In [34]:
import joblib

model_file = "./model/model_xgbr.pkl"
joblib.dump(model_xgbr, model_file)

['./model/model_xgbr.pkl']

### GradientBoosting

In [41]:
model_gbr = GradientBoostingRegressor(max_leaf_nodes=1000, learning_rate=0.1, max_depth=4, n_estimators=5000)

In [42]:
model_gbr.fit(X_train, y_train)

In [43]:
y_pred_train = model_gbr.predict(X_train)

y_pred_test = model_gbr.predict(X_test)

In [44]:
mse_train = mean_squared_error(pred_train, y_train)

mae_test = mean_squared_error(pred_test, y_test)

print('mse train :', mse_train)
print('mse test :', mae_test)

mse train : 1235040547.119731
mse test : 1318278098.357321


In [45]:
mae_train = mean_absolute_error(pred_train, y_train)

mae_test = mean_absolute_error(pred_test, y_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 22756.280064692455
MAE test lr: 23145.415507324356

Relative error 0.12793092259540548
