In [1]:
import joblib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
model_file = "./model/model_xgbr.pkl"
model_xgbr = joblib.load(model_file)

In [3]:
model_xgbr

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.09, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.64,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [4]:
df = pd.read_csv("./dataset/test.csv")
df.shape

(1459, 80)

In [5]:
df_id = df[['Id']]
df_id.shape

(1459, 1)

### variable numérique

In [6]:
df['GarageArea'] = df[['GarageArea']].fillna((df['GarageArea'].mean()), inplace=False)
df['TotalBsmtSF'] = df[['TotalBsmtSF']].fillna((df['TotalBsmtSF'].mean()), inplace=False)

In [7]:
df.loc[df['GarageArea'] == 0, 'GarageArea'] = df['GarageArea'].quantile(0.5)
df.loc[df['TotalBsmtSF'] == 0, 'TotalBsmtSF'] = df['TotalBsmtSF'].quantile(0.5)

In [8]:
df_num = df[['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea']] 
df_num.describe()

Unnamed: 0,GrLivArea,GarageArea,TotalBsmtSF,1stFlrSF,LotArea
count,1459.0,1459.0,1459.0,1459.0,1459.0
mean,1486.045922,497.772288,1073.882192,1156.534613,9819.161069
std,485.566099,186.558747,405.677201,398.16582,4955.517327
min,407.0,100.0,160.0,407.0,1470.0
25%,1117.5,368.5,816.0,873.5,7391.0
50%,1432.0,480.0,988.0,1079.0,9399.0
75%,1721.0,576.0,1304.0,1382.5,11517.5
max,5095.0,1488.0,5095.0,5095.0,56600.0


In [9]:
scaler = MinMaxScaler()
df_num_scaled = scaler.fit_transform(df_num)

scaler = StandardScaler().fit(df_num_scaled)
df_num_scaled = scaler.transform(df_num_scaled)

df_num = pd.DataFrame(df_num_scaled, columns = df_num.columns.tolist())

### Variable cétégorielle

In [10]:
df_cat = df[['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']].astype(str)
df_cat.describe()

Unnamed: 0,OverallQual,GarageCars,FullBath,TotRmsAbvGrd
count,1459,1459.0,1459,1459
unique,10,7.0,5,12
top,5,2.0,2,6
freq,428,770.0,762,442


In [11]:
df_cat = pd.get_dummies(df_cat)
df_cat.head()

Unnamed: 0,OverallQual_1,OverallQual_10,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,...,TotRmsAbvGrd_12,TotRmsAbvGrd_13,TotRmsAbvGrd_15,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
df_new = df_num.join(df_cat)
df_new.shape

(1459, 39)

In [13]:
df_new.tail()

Unnamed: 0,GrLivArea,GarageArea,TotalBsmtSF,1stFlrSF,LotArea,OverallQual_1,OverallQual_10,OverallQual_2,OverallQual_3,OverallQual_4,...,TotRmsAbvGrd_12,TotRmsAbvGrd_13,TotRmsAbvGrd_15,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
1454,-0.811797,-0.095296,-1.301683,-1.533893,-1.59133,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1455,-0.811797,-1.13554,-1.301683,-1.533893,-1.599808,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1456,-0.539856,0.419463,0.370169,0.169499,2.05515,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1457,-1.063136,-0.095296,-0.399179,-0.468645,0.125527,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1458,1.058827,0.816257,-0.192047,-0.403324,-0.03879,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Missing variables (difference between train and test)

In [20]:
model_xbgr_columns = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea',
       'OverallQual_1', 'OverallQual_10', 'OverallQual_2', 'OverallQual_3',
       'OverallQual_4', 'OverallQual_5', 'OverallQual_6', 'OverallQual_7',
       'OverallQual_8', 'OverallQual_9', 'GarageCars_0', 'GarageCars_0.0',
       'GarageCars_1', 'GarageCars_1.0', 'GarageCars_2', 'GarageCars_2.0',
       'GarageCars_3', 'GarageCars_3.0', 'GarageCars_4', 'GarageCars_4.0',
       'GarageCars_5.0', 'GarageCars_nan', 'FullBath_0', 'FullBath_1',
       'FullBath_2', 'FullBath_3', 'FullBath_4', 'TotRmsAbvGrd_10',
       'TotRmsAbvGrd_11', 'TotRmsAbvGrd_12', 'TotRmsAbvGrd_13',
       'TotRmsAbvGrd_14', 'TotRmsAbvGrd_15', 'TotRmsAbvGrd_2',
       'TotRmsAbvGrd_3', 'TotRmsAbvGrd_4', 'TotRmsAbvGrd_5', 'TotRmsAbvGrd_6',
       'TotRmsAbvGrd_7', 'TotRmsAbvGrd_8', 'TotRmsAbvGrd_9']

for model_xbgr_column in model_xbgr_columns:
    if model_xbgr_column not in df_new.columns:
        df_new[model_xbgr_column] = 0
        print(model_xbgr_column)

df_new.shape

(1459, 46)

In [21]:
pred = model_xgbr.predict(df_new)

In [28]:
df_pred = pd.DataFrame(pred, columns=['SalePrice'])

df_pred.shape

(1459, 1)

In [29]:
df_pred = df_id.join(df_pred)

df_pred.shape

(1459, 2)

In [30]:
df_pred.head()

Unnamed: 0,Id,SalePrice
0,1461,133051.5
1,1462,168287.921875
2,1463,129371.929688
3,1464,154468.03125
4,1465,196063.84375


In [31]:
df_pred.to_csv('./submission/model_xgbr.csv', index=False)

Ranking Kaggle
3444
Fernando Gonçalves
0.20832
1
1s
🙂
Your First Entry!
Welcome to the leaderboard!