In [153]:

%matplotlib inline
from pathlib import Path
from typing import List, Dict

import joblib
import matplotlib.pyplot as plt
import numpy as np
from numpy import arange
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the labelled training split from the local dataset folder and display its
# (rows, columns) shape as a quick sanity check.
df_train = pd.read_csv("./dataset/train.csv")
df_train.shape

(1460, 81)

In [3]:
# Load the test split from the local dataset folder and display its
# (rows, columns) shape as a quick sanity check.
df_test = pd.read_csv("./dataset/test.csv")
df_test.shape

(1459, 80)

### Variables numériques

In [4]:
# Disabled experiment (kept for reference): mean-imputation of GrLivArea.
#df[['GrLivArea']].fillna((df['GrLivArea'].mean()), inplace=False)
# Raw numeric surface-area columns.
# NOTE(review): this frame is built before the outlier rows are dropped and is not
# referenced by any later active cell in this notebook — confirm it is still needed.
df_train_num = df_train[['GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea']] # 'GrLivArea' is left out here (discretised)

df_train_num.head()

Unnamed: 0,GarageArea,TotalBsmtSF,1stFlrSF,LotArea
0,548,856,856,8450
1,460,1262,1262,9600
2,608,920,920,11250
3,642,756,961,9550
4,836,1145,1145,14260


#### Suppression des outliers extrêmes

In [5]:
# Flag rows with a very large living area (> 4000) but a low sale price (< 300000),
# then drop them by index label so the remaining rows keep their original labels.
is_extreme_outlier = (df_train.GrLivArea > 4000) & (df_train.SalePrice < 300000)
df_train = df_train.drop(index=df_train[is_extreme_outlier].index)

### Discrétisation et dichotomisation

In [165]:
def qcut(df, variables, labels = ['micro-','micro','micro+', 'tiny-', 'tiny','tiny+', 'small-', 'small', 'small+','medium-', 'medium','medium+', 'large-', 'large', 'large+']) -> Dict:
    """Quantile-bin each requested column of ``df`` into labelled categories.

    Args:
        df: DataFrame holding the columns to discretise.
        variables: iterable of column names to bin.
        labels: category names, lowest bin first; the number of labels is
            also the number of quantile bins.

    Returns:
        Dict mapping each column name to a one-column DataFrame (same column
        name, same index as ``df``) containing the categorical bin labels.

    Side effects:
        Prints the number of labels and the number of binned variables.
    """
    n_bins = len(labels)
    print('labels :', n_bins)
    binned = {
        name: pd.qcut(df[name], n_bins, labels=labels, precision=0).to_frame()
        for name in variables
    }
    print('df_dis_dic :', len(binned))
    return binned

In [166]:
def dummies(df_dic: Dict) -> Dict:
    """One-hot encode every frame in ``df_dic``.

    Args:
        df_dic: dict mapping a variable name to a DataFrame to encode.

    Returns:
        Dict with the same keys; each value is ``pd.get_dummies`` of the
        corresponding frame, with the key used as the column prefix.

    Side effects:
        Prints the number of encoded variables.
    """
    encoded = {
        name: pd.get_dummies(frame, prefix=name)
        for name, frame in df_dic.items()
    }
    print('df_dch_dic : ', len(encoded))
    return encoded

In [191]:
def qcut_dummies(df, variables : List) -> Dict:
    """Discretise ``variables`` of ``df`` with ``qcut`` and one-hot encode the result.

    Returns:
        Dict mapping each variable name to its indicator DataFrame, as
        produced by ``dummies``.
    """
    binned_frames = qcut(df, variables)
    return dummies(binned_frames)

In [210]:
def join_list(df_list: List) -> pd.DataFrame:
    """Join the frames in ``df_list`` left to right with ``DataFrame.join``.

    Args:
        df_list: frames to combine; the first one is the left-most operand.

    Returns:
        The joined DataFrame. For a single frame this is that very frame;
        for an empty list the result is ``None``.
    """
    joined = None
    for frame in df_list:
        # The first frame seeds the accumulator; later frames are joined onto it.
        joined = frame if joined is None else joined.join(frame)
    return joined



In [211]:
def join_dict(df_dic: Dict) -> pd.DataFrame:
    """Join all DataFrame values of ``df_dic`` (in dict order) via ``join_list``.

    Side effects:
        Prints the type of ``df_dic``.
    """
    print(type(df_dic))
    frames = [*df_dic.values()]
    return join_list(frames)

In [216]:
# Discretise the five surface-area variables into quantile bins (15 with the
# default labels of qcut) and one-hot encode each of them.
df_dic = qcut_dummies(df_train, ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea'])

# Combine the per-variable indicator frames column-wise on the shared row index.
df_train_dis = join_dict(df_dic)

# Display the shape of the frame built above. The previous version displayed
# `df_joined`, which is never assigned in this notebook and fails on a fresh kernel.
df_train_dis.shape

labels : 15
df_dis_dic : 5
df_dch_dic :  5
<class 'dict'>


(1458, 75)

### Variable catégorielle

In [217]:
# Quality/count features treated as categorical. They are cast to str so that
# get_dummies one-hot encodes them instead of leaving them numeric.
cat_columns = ['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']
df_train_cat = df_train[cat_columns].astype(str)
df_test_cat = df_test[cat_columns].astype(str)
# NOTE(review): if a column is parsed as int in one file and float in the other
# (e.g. because of a missing value), astype(str) yields '2' vs '2.0' and the two
# end up as different dummy columns — verify the dtypes of both frames.

# Encode train and test together so both get exactly the same dummy columns.
# Training rows come first in the concatenation.
df_train_test_cat = pd.concat([df_train_cat, df_test_cat])
df_train_test_cat = pd.get_dummies(df_train_test_cat)

In [218]:
# The training rows were concatenated first, so the leading len(df_train_cat)
# rows of the encoded frame are exactly the training dummies. Using the length
# instead of a hard-coded 1458 keeps this correct if the outlier filter changes.
df_new = df_train_dis.join(
    df_train_test_cat.head(len(df_train_cat)))

In [219]:
# Preview the first rows of the combined feature matrix (binned numeric + categorical dummies).
df_new.head()

Unnamed: 0,GrLivArea_micro-,GrLivArea_micro,GrLivArea_micro+,GrLivArea_tiny-,GrLivArea_tiny,GrLivArea_tiny+,GrLivArea_small-,GrLivArea_small,GrLivArea_small+,GrLivArea_medium-,...,TotRmsAbvGrd_14,TotRmsAbvGrd_15,TotRmsAbvGrd_2,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [220]:
# Explanatory variables (feature matrix)
X = df_new

# Target variable
y = df_train['SalePrice']

In [221]:
# Hold out 33% of the labelled rows for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=33)

### Linear

In [222]:
# Ordinary least-squares baseline: fit on the training split (fit returns the
# estimator itself), then predict on both splits to compare train and held-out error.
lr = LinearRegression().fit(X_train, y_train)

y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

In [223]:
# Mean squared error of the linear model on the training and held-out splits.
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)

print("MSE train lr:", mse_train)
print("MSE test lr:", mse_test)

MSE train lr: 891839411.4692622
MSE test lr: 1013095765.93361


In [224]:
# Mean absolute error of the linear model on the training and held-out splits.
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

# Express the held-out MAE as a fraction of the mean sale price of df_train.
mean_price = df_train.SalePrice.mean()
print("\nRelative error", mae_test / mean_price)

MAE train lr: 20455.233606557376
MAE test lr: 22234.7510373444

Relative error 0.12288947280557797


### Ridge

In [53]:
# Ridge regression; the regularisation strength is picked by the estimator's
# built-in cross-validation among the candidate alphas below.
ridge_reg = RidgeCV(
    alphas=(0.001, 0.01, 0.1, 0.3, 0.7, 1, 10, 50, 100),
)
ridge_reg.fit(X_train, y_train)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 3.e-01, 7.e-01, 1.e+00, 1.e+01, 5.e+01,
       1.e+02]))

In [54]:
# Report the alpha retained by cross-validation and the estimator's score()
# (R² for scikit-learn regressors) on the training and held-out splits.
print('alpha sélectionné par c-v :', ridge_reg.alpha_)
print('score train :', ridge_reg.score(X_train, y_train))
print('score test :', ridge_reg.score(X_test, y_test))

alpha sélectionné par c-v : 1.0
score train : 0.8649338783173685
score test : 0.820820122954007


In [55]:
# Ridge predictions on both splits, reused by the error metrics in the next cells.
ridge_pred_train = ridge_reg.predict(X_train)

ridge_pred_test = ridge_reg.predict(X_test)

In [56]:
# Mean squared error of the ridge model on both splits.
# Arguments follow scikit-learn's (y_true, y_pred) order, and the held-out value
# is stored in mse_test (it was previously stored in the misleading name mae_test).
mse_train = mean_squared_error(y_train, ridge_pred_train)

mse_test = mean_squared_error(y_test, ridge_pred_test)

print('mse train :', mse_train)
print('mse test :', mse_test)

mse train : 899869805.863024
mse test : 1003598353.7782211


In [57]:
# Mean absolute error of the ridge model on both splits, (y_true, y_pred) order.
mae_train = mean_absolute_error(y_train, ridge_pred_train)

mae_test = mean_absolute_error(y_test, ridge_pred_test)

# Labels name the ridge model (they previously said "lr", copied from the linear cell).
print("MAE train ridge:", mae_train)
print("MAE test ridge:", mae_test)

# Express the held-out MAE as a fraction of the mean sale price of df_train.
mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 20463.672961405857
MAE test lr: 22345.857273945247

Relative error 0.12350354700044337


### ElasticNet

In [58]:
# Elastic net; the L1/L2 mixing ratio and alpha are selected by 8-fold
# cross-validation over the grids below.
model_en = ElasticNetCV(
    cv=8,
    l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99),
    alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0),
    max_iter=50000,
)

model_en.fit(X_train, y_train)

ElasticNetCV(alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0),
             cv=8, l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99),
             max_iter=50000)

In [59]:
# Elastic-net predictions on both splits, reused by the error metrics in the next cells.
pred_train = model_en.predict(X_train)
pred_test = model_en.predict(X_test)


In [60]:
# Mean squared error of the elastic-net model on both splits.
# Arguments follow scikit-learn's (y_true, y_pred) order, and the held-out value
# is stored in mse_test (it was previously stored in the misleading name mae_test).
mse_train = mean_squared_error(y_train, pred_train)

mse_test = mean_squared_error(y_test, pred_test)

print('mse train :', mse_train)
print('mse test :', mse_test)

mse train : 899612476.4021239
mse test : 1003711518.1023433


In [61]:
# Mean absolute error of the elastic-net model on both splits, (y_true, y_pred) order.
mae_train = mean_absolute_error(y_train, pred_train)

mae_test = mean_absolute_error(y_test, pred_test)

# Labels name the elastic-net model (they previously said "lr", copied from the linear cell).
print("MAE train elastic net:", mae_train)
print("MAE test elastic net:", mae_test)

# Express the held-out MAE as a fraction of the mean sale price of df_train.
mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 20463.727012909345
MAE test lr: 22344.033623116793

Relative error 0.12349346784603711


### XGBoost

In [62]:
# Gradient-boosted trees with hand-picked hyper-parameters.
# NOTE(review): subsample < 1 draws a random row sample per tree and no
# random_state is passed explicitly — consider pinning one for reproducibility.
model_xgbr = XGBRegressor(learning_rate=0.09, max_depth=4, n_estimators=500, tree_method='auto', subsample=0.64)

In [63]:
# Train the booster on the training split; the cell displays the fitted estimator's repr.
model_xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.09, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.64,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [64]:
# XGBoost predictions on both splits. This overwrites the y_pred_train / y_pred_test
# variables produced earlier by the linear-regression cell.
y_pred_train = model_xgbr.predict(X_train)

y_pred_test = model_xgbr.predict(X_test)

In [65]:
# Mean squared error of the XGBoost model on the training and held-out splits.
mse_train = mean_squared_error(y_train, y_pred_train)

mse_test = mean_squared_error(y_test, y_pred_test)

# Labels name the XGBoost model (they previously said "lr", copied from the linear cell).
print("MSE train xgb:", mse_train)
print("MSE test xgb:", mse_test)

MSE train lr: 108975611.5185746
MSE test lr: 1087185521.6175606


In [66]:
# Mean absolute error of the XGBoost model on the training and held-out splits.
mae_train = mean_absolute_error(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)

# Labels name the XGBoost model (they previously said "lr", copied from the linear cell).
print("MAE train xgb:", mae_train)
print("MAE test xgb:", mae_test)

# Express the held-out MAE as a fraction of the mean sale price of df_train.
mean_price = df_train['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 8016.6101314357065
MAE test lr: 22844.366685036308

Relative error 0.12625876376067188


#### save a model
#### https://www.geeksforgeeks.org/saving-a-machine-learning-model/

In [67]:
# Persist the fitted XGBoost regressor to disk with joblib
# (joblib and Path are imported in the import cell at the top of the notebook).
model_file = "./model/model_xgbr.pkl"

# Create the output folder first so that a run on a fresh checkout does not fail
# with FileNotFoundError when ./model does not exist yet.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model_xgbr, model_file)

['./model/model_en.pkl']