In [1]:
import joblib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from typing import List, Dict

In [2]:
model_file = "./model/model_xgbr.pkl"
model_xgbr = joblib.load(model_file)

In [3]:
model_xgbr

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.09, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.64,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [4]:
df = pd.read_csv("./dataset/test.csv")
df.shape

(1459, 80)

In [5]:
df_id = df[['Id']]
df_id.shape

(1459, 1)

### variable numérique

In [6]:
df['GarageArea'] = df[['GarageArea']].fillna((df['GarageArea'].mean()), inplace=False)
df['TotalBsmtSF'] = df[['TotalBsmtSF']].fillna((df['TotalBsmtSF'].mean()), inplace=False)

In [7]:
df.loc[df['GarageArea'] == 0, 'GarageArea'] = df['GarageArea'].quantile(0.5)
df.loc[df['TotalBsmtSF'] == 0, 'TotalBsmtSF'] = df['TotalBsmtSF'].quantile(0.5)

In [8]:
df_num = df[['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea']] 
df_num.describe()

Unnamed: 0,GrLivArea,GarageArea,TotalBsmtSF,1stFlrSF,LotArea
count,1459.0,1459.0,1459.0,1459.0,1459.0
mean,1486.045922,497.772288,1073.882192,1156.534613,9819.161069
std,485.566099,186.558747,405.677201,398.16582,4955.517327
min,407.0,100.0,160.0,407.0,1470.0
25%,1117.5,368.5,816.0,873.5,7391.0
50%,1432.0,480.0,988.0,1079.0,9399.0
75%,1721.0,576.0,1304.0,1382.5,11517.5
max,5095.0,1488.0,5095.0,5095.0,56600.0


In [9]:
def qcut(df, variables, labels = ['micro-','micro','micro+', 'tiny-', 'tiny','tiny+', 'small-', 'small', 'small+','medium-', 'medium','medium+', 'large-', 'large', 'large+']) -> Dict:
    print('labels :', len(labels))
    df_dis_dic = {}
    for variable in variables:
        #print(variable)
        df_dis_dic[variable] = pd.qcut(x      = df[variable],
                  q   = len(labels),
                  labels = labels,
                  precision=0).to_frame()
    print('df_dis_dic :', len(df_dis_dic))
    return df_dis_dic

In [10]:
def dummies(df_dic: Dict) -> Dict:
    #df_train_dis_GrLivArea = pd.get_dummies(df_train_dis_GrLivArea, prefix = 'GrLivArea')
    df_dch_dic = {}
    for variable, df in df_dic.items():
        df_dch_dic[variable] = pd.get_dummies(df, prefix = variable)
    print('df_dch_dic : ', len(df_dch_dic))
    return df_dch_dic

In [11]:
def qcut_dummies(df, variables : List) -> Dict:
    return dummies(qcut(df, variables))

In [12]:
def join_list(df_list: List) -> pd.DataFrame:
    
    df_join = None
    
    for df in df_list:
        if df_join is None:
            df_join = df
        else:
            df_join = df_join.join(df)
        
    return df_join



In [13]:
def join_dict(df_dic: Dict) -> pd.DataFrame:
    print(type(df_dic))
    return join_list(list(df_dic.values()))

In [14]:
df_dic = qcut_dummies(df_num, ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'LotArea'])

df_num = join_dict(df_dic)



df_num.shape

labels : 15
df_dis_dic : 5
df_dch_dic :  5
<class 'dict'>


(1459, 75)

### Variable cétégorielle

In [15]:
df_cat = df[['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd']].astype(str)
df_cat.describe()

Unnamed: 0,OverallQual,GarageCars,FullBath,TotRmsAbvGrd
count,1459,1459.0,1459,1459
unique,10,7.0,5,12
top,5,2.0,2,6
freq,428,770.0,762,442


In [16]:
df_cat = pd.get_dummies(df_cat)
df_cat.head()

Unnamed: 0,OverallQual_1,OverallQual_10,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,...,TotRmsAbvGrd_12,TotRmsAbvGrd_13,TotRmsAbvGrd_15,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [17]:
df_new = df_num.join(df_cat)
df_new.shape

(1459, 109)

In [18]:
df_new.tail()

Unnamed: 0,GrLivArea_micro-,GrLivArea_micro,GrLivArea_micro+,GrLivArea_tiny-,GrLivArea_tiny,GrLivArea_tiny+,GrLivArea_small-,GrLivArea_small,GrLivArea_small+,GrLivArea_medium-,...,TotRmsAbvGrd_12,TotRmsAbvGrd_13,TotRmsAbvGrd_15,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9
1454,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1455,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1456,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1457,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Missing variables (difference between train and test)

In [19]:
model_xbgr_columns = ['GrLivArea_micro-', 'GrLivArea_micro', 'GrLivArea_micro+', 'GrLivArea_tiny-', 'GrLivArea_tiny', 'GrLivArea_tiny+', 'GrLivArea_small-', 'GrLivArea_small', 'GrLivArea_small+', 'GrLivArea_medium-', 'GrLivArea_medium', 'GrLivArea_medium+', 'GrLivArea_large-', 'GrLivArea_large', 'GrLivArea_large+', 'GarageArea_micro-', 'GarageArea_micro', 'GarageArea_micro+', 'GarageArea_tiny-', 'GarageArea_tiny', 'GarageArea_tiny+', 'GarageArea_small-', 'GarageArea_small', 'GarageArea_small+', 'GarageArea_medium-', 'GarageArea_medium', 'GarageArea_medium+', 'GarageArea_large-', 'GarageArea_large', 'GarageArea_large+', 'TotalBsmtSF_micro-', 'TotalBsmtSF_micro', 'TotalBsmtSF_micro+', 'TotalBsmtSF_tiny-', 'TotalBsmtSF_tiny', 'TotalBsmtSF_tiny+', 'TotalBsmtSF_small-', 'TotalBsmtSF_small', 'TotalBsmtSF_small+', 'TotalBsmtSF_medium-', 'TotalBsmtSF_medium', 'TotalBsmtSF_medium+', 'TotalBsmtSF_large-', 'TotalBsmtSF_large', 'TotalBsmtSF_large+', '1stFlrSF_micro-', '1stFlrSF_micro', '1stFlrSF_micro+', '1stFlrSF_tiny-', '1stFlrSF_tiny', '1stFlrSF_tiny+', '1stFlrSF_small-', '1stFlrSF_small', '1stFlrSF_small+', '1stFlrSF_medium-', '1stFlrSF_medium', '1stFlrSF_medium+', '1stFlrSF_large-', '1stFlrSF_large', '1stFlrSF_large+', 'LotArea_micro-', 'LotArea_micro', 'LotArea_micro+', 'LotArea_tiny-', 'LotArea_tiny', 'LotArea_tiny+', 'LotArea_small-', 'LotArea_small', 'LotArea_small+', 'LotArea_medium-', 'LotArea_medium', 'LotArea_medium+', 'LotArea_large-', 'LotArea_large', 'LotArea_large+', 'OverallQual_1', 'OverallQual_10', 'OverallQual_2', 'OverallQual_3', 'OverallQual_4', 'OverallQual_5', 'OverallQual_6', 'OverallQual_7', 'OverallQual_8', 'OverallQual_9', 'GarageCars_0', 'GarageCars_0.0', 'GarageCars_1', 'GarageCars_1.0', 'GarageCars_2', 'GarageCars_2.0', 'GarageCars_3', 'GarageCars_3.0', 'GarageCars_4', 'GarageCars_4.0', 'GarageCars_5.0', 'GarageCars_nan', 'FullBath_0', 'FullBath_1', 'FullBath_2', 'FullBath_3', 'FullBath_4', 'TotRmsAbvGrd_10', 'TotRmsAbvGrd_11', 'TotRmsAbvGrd_12', 'TotRmsAbvGrd_13', 'TotRmsAbvGrd_14', 'TotRmsAbvGrd_15', 'TotRmsAbvGrd_2', 'TotRmsAbvGrd_3', 'TotRmsAbvGrd_4', 'TotRmsAbvGrd_5', 'TotRmsAbvGrd_6', 'TotRmsAbvGrd_7', 'TotRmsAbvGrd_8', 'TotRmsAbvGrd_9']

for model_xbgr_column in model_xbgr_columns:
    if model_xbgr_column not in df_new.columns:
        df_new[model_xbgr_column] = 0
        print(model_xbgr_column)

df_new.shape

GarageCars_0
GarageCars_1
GarageCars_2
GarageCars_3
GarageCars_4
TotRmsAbvGrd_14
TotRmsAbvGrd_2


(1459, 116)

In [20]:
pred = model_xgbr.predict(df_new)

In [21]:
df_pred = pd.DataFrame(pred, columns=['SalePrice'])

df_pred.shape

(1459, 1)

In [22]:
df_pred = df_id.join(df_pred)

df_pred.shape

(1459, 2)

In [23]:
df_pred.head()

Unnamed: 0,Id,SalePrice
0,1461,127110.75
1,1462,178178.671875
2,1463,146484.109375
3,1464,171336.546875
4,1465,207622.25


In [24]:
df_pred.to_csv('./submission/model_xgbr.csv', index=False)

Ranking Kaggle
3444
Fernando Gonçalves
0.20832
1
1s
🙂
Your First Entry!
Welcome to the leaderboard!

With hot one points for numerical (a01-b and a02-b), Score: 0.22209