In [1]:

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import arange
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the training set from disk; the trailing expression echoes
# the (rows, columns) shape of the loaded frame.
df = pd.read_csv(filepath_or_buffer="./dataset/train.csv")
df.shape

(1460, 81)

### Variable numérique

In [28]:
# Number of missing MasVnrArea values. Only the last expression of a cell is
# echoed, so this count is evaluated but not displayed.
df.loc[:, 'MasVnrArea'].isnull().sum()
# Summary statistics of MasVnrArea (this is the value the cell displays).
df.loc[:, 'MasVnrArea'].describe()

count    1452.000000
mean      103.685262
std       181.066207
min         0.000000
25%         0.000000
50%         0.000000
75%       166.000000
max      1600.000000
Name: MasVnrArea, dtype: float64

In [4]:
# Overwrite rows whose TotalBsmtSF is exactly 0 with the column's 0.5 quantile
# (the median). The quantile is taken over the column as it currently stands,
# i.e. with the zero rows still included. This mutates `df` in place.
df.loc[df['TotalBsmtSF'].eq(0), 'TotalBsmtSF'] = df['TotalBsmtSF'].quantile(q=0.5)

In [5]:
# Numeric (surface-area) columns used as continuous features.
df_num = df.loc[:, ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']]

# Preview the first rows of the selection.
df_num.head(n=5)

Unnamed: 0,GrLivArea,GarageArea,TotalBsmtSF,1stFlrSF
0,1710,548,856.0,856
1,1262,460,1262.0,1262
2,1786,608,920.0,920
3,1717,642,756.0,961
4,2198,836,1145.0,1145


In [6]:
# Standardize each numeric column to zero mean and unit variance.
#
# The previous version ran MinMaxScaler first and then StandardScaler on its
# output. Standardization is invariant to a prior per-column affine rescale,
# so the min-max pass did not change the result. It also rebound `scaler`,
# discarding the fitted MinMaxScaler, so the surviving `scaler` could not be
# applied to new raw data. A single StandardScaler yields the same features
# (up to floating-point rounding) and stays reusable on unseen rows.
#
# NOTE(review): the scaler is fitted on every row of df_num, before the
# train/test split performed further down, so test-set statistics influence
# the training features. Consider fitting on the training rows only.
scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(df_num)

In [7]:
# Wrap the scaled array back into a DataFrame. The original column names AND
# the original row index are both carried over: the later `df_num.join(df_cat)`
# aligns on index, so a fresh 0..n-1 index would silently misalign rows if
# `df` ever had a non-default index (e.g. after filtering or dropping rows).
df_num = pd.DataFrame(
    df_num_scaled,
    columns=df_num.columns.tolist(),
    index=df_num.index,
)

### Variable catégorielle

In [8]:
# Columns treated as categorical. They are cast to strings so that
# pd.get_dummies (next step) one-hot encodes every distinct level.
df_cat = df.loc[:, ['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']].astype(dtype=str)
# For string columns describe() reports count / unique / top / freq.
df_cat.describe()

Unnamed: 0,OverallQual,GarageCars,FullBath,TotRmsAbvGrd
count,1460,1460,1460,1460
unique,10,5,4,12
top,5,2,2,6
freq,397,824,768,402


In [9]:
# Distinct OverallQual levels after the string cast, in order of appearance.
df_cat.loc[:, 'OverallQual'].unique()

array(['7', '6', '8', '5', '9', '4', '10', '3', '1', '2'], dtype=object)

In [10]:
# Distinct GarageCars values in the raw (still numeric) frame.
df.loc[:, 'GarageCars'].unique()

array([2, 3, 1, 0, 4], dtype=int64)

In [11]:
# One-hot encode every categorical column: one indicator column per level,
# named "<column>_<level>".
df_cat = pd.get_dummies(data=df_cat)
df_cat.head(n=5)

Unnamed: 0,OverallQual_1,OverallQual_10,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,...,TotRmsAbvGrd_12,TotRmsAbvGrd_14,TotRmsAbvGrd_2,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Combine the numeric features and the one-hot columns side by side
# (index-aligned left join), then echo the resulting shape.
df_new = df_num.join(other=df_cat, how='left')
df_new.shape

(1460, 35)

In [13]:
# X: explanatory variables (the assembled feature matrix).
# y: target variable (sale price).
X, y = df_new, df['SalePrice']

In [14]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
#
# A disabled scaling experiment used to sit here inside a triple-quoted
# string. A bare string is a live expression, so the notebook echoed the dead
# code back as the cell's output; it has been removed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Linear

In [15]:
# Ordinary least squares baseline; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, used by the metric cells that follow.
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

In [16]:
# Mean squared error of the linear model on each split.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

print("MSE train lr:", mse_train)
print("MSE test lr:", mse_test)

MSE train lr: 1310455467.3429494
MSE test lr: 1235361117.1129994


In [17]:
# Mean absolute error of the linear model on each split.
mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print("MAE train lr:", mae_train)
print("MAE test lr:", mae_test)

# Test MAE expressed as a fraction of the mean sale price, where the mean is
# taken over the whole dataset (not only the test rows).
mean_price = df.loc[:, 'SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 23465.586857447288
MAE test lr: 23308.27244191951

Relative error 0.12883107657567103


### Ridge

In [18]:
# Ridge regression; the regularization strength is chosen by RidgeCV's
# built-in cross-validation among the candidate alphas below.
ridge_reg = RidgeCV(
    alphas=(0.001, 0.01, 0.1, 0.3, 0.7, 1, 10, 50, 100),
)
ridge_reg.fit(X=X_train, y=y_train)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 3.e-01, 7.e-01, 1.e+00, 1.e+01, 5.e+01,
       1.e+02]))

In [19]:
# Report the alpha picked by cross-validation and the estimator's score()
# on each split.
print('alpha sélectionné par c-v :', ridge_reg.alpha_)
print('score train :', ridge_reg.score(X=X_train, y=y_train))
print('score test :', ridge_reg.score(X=X_test, y=y_test))

alpha sélectionné par c-v : 1.0
score train : 0.7810517535258238
score test : 0.8266841533774373


In [20]:
# Ridge predictions on both splits, consumed by the metric cells below.
ridge_pred_train, ridge_pred_test = (
    ridge_reg.predict(X_train),
    ridge_reg.predict(X_test),
)

In [21]:
# Mean squared error of the Ridge model on each split.
#
# The test MSE used to be stored in a variable named `mae_test`, which left a
# squared-error value under a MAE name and left `mse_test` holding the stale
# linear-regression figure. It is now stored in `mse_test`. Arguments follow
# scikit-learn's (y_true, y_pred) order; MSE is symmetric, so the values are
# unchanged.
mse_train = mean_squared_error(y_train, ridge_pred_train)

mse_test = mean_squared_error(y_test, ridge_pred_test)

print('mse train :', mse_train)
print('mse test :', mse_test)

mse train : 1317756544.9717116
mse test : 1209414227.9888186


In [22]:
# Mean absolute error of the Ridge model on each split.
#
# The printed labels previously said "lr" (copied from the linear-regression
# cell), mislabelling Ridge results; they now say "ridge". Arguments follow
# scikit-learn's (y_true, y_pred) order; MAE is symmetric, so the values are
# unchanged.
mae_train = mean_absolute_error(y_train, ridge_pred_train)

mae_test = mean_absolute_error(y_test, ridge_pred_test)

print("MAE train ridge:", mae_train)
print("MAE test ridge:", mae_test)

# Test MAE as a fraction of the mean sale price over the whole dataset.
mean_price = df['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 23488.588565964026
MAE test lr: 22944.96282172709

Relative error 0.12682296681050848


### ElasticNet

In [23]:
# Elastic net; ElasticNetCV searches the alpha x l1_ratio grid below with
# 8-fold cross-validation. max_iter is raised to 50000 for the solver.
model_en = ElasticNetCV(
    l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99),
    alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0),
    cv=8,
    max_iter=50000,
)

model_en.fit(X=X_train, y=y_train)

  model = cd_fast.enet_coordinate_descent_gram(


ElasticNetCV(alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0),
             cv=8, l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99),
             max_iter=50000)

In [24]:
# Elastic-net predictions on both splits, consumed by the metric cells below.
pred_train, pred_test = model_en.predict(X_train), model_en.predict(X_test)


In [25]:
# Mean squared error of the elastic-net model on each split.
#
# The test MSE used to be stored in a variable named `mae_test`, which left a
# squared-error value under a MAE name and left `mse_test` holding a figure
# from an earlier model. It is now stored in `mse_test`. Arguments follow
# scikit-learn's (y_true, y_pred) order; MSE is symmetric, so the values are
# unchanged.
mse_train = mean_squared_error(y_train, pred_train)

mse_test = mean_squared_error(y_test, pred_test)

print('mse train :', mse_train)
print('mse test :', mse_test)

mse train : 1338143325.706984
mse test : 1219384920.324139


In [26]:
# Mean absolute error of the elastic-net model on each split.
#
# The printed labels previously said "lr" (copied from the linear-regression
# cell), mislabelling elastic-net results; they now say "elastic net".
# Arguments follow scikit-learn's (y_true, y_pred) order; MAE is symmetric,
# so the values are unchanged.
mae_train = mean_absolute_error(y_train, pred_train)

mae_test = mean_absolute_error(y_test, pred_test)

print("MAE train elastic net:", mae_train)
print("MAE test elastic net:", mae_test)

# Test MAE as a fraction of the mean sale price over the whole dataset.
mean_price = df['SalePrice'].mean()

print("\nRelative error", mae_test / mean_price)

MAE train lr: 23551.522434744795
MAE test lr: 22957.513420798234

Relative error 0.12689233733953564


In [29]:
# save a model
# https://www.geeksforgeeks.org/saving-a-machine-learning-model/