Kaggle House Prices Competition - Beginner Data Science Guide
https://youtu.be/NFOfBtpV7L0

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn import neighbors
from sklearn.decomposition import PCA
from sklearn import linear_model, preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn import model_selection
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def df_corr_columns(df, target, threshold):
    df_corr = df.corr()
    df_corr_SalePrice = df_corr[[target]]
    df_corr_SalePrice = df_corr_SalePrice.reset_index()
    df_corr_SalePrice = df_corr_SalePrice[(df_corr_SalePrice['index'] != target)]
    df_corr_SalePrice_columns = df_corr_SalePrice[(
        (df_corr_SalePrice[target] > threshold) | 
        (df_corr_SalePrice[target] < -threshold))]['index']
    return df_corr_SalePrice_columns

def separe_per_unique_threshold(df, threshold):
    dic_c_unique_len = {}
    for c in df.columns:
        dic_c_unique_len[c] = len(df[c].unique())
        #print(c, len(df_num[c].unique()))
 

    df_c_unique_len = pd.DataFrame.from_dict(dic_c_unique_len, orient="index").reset_index()
    df_c_unique_len.columns=['variable', 'count']
    return df_c_unique_len[df_c_unique_len['count'] <= 20].sort_values('count'), df_c_unique_len[df_c_unique_len['count'] > 20].sort_values('count')

def dummies(df, quantiles = 4):
    
    columns = df.columns
    for c in columns:
        
        df[c] = pd.qcut(df[c], q=quantiles, duplicates='drop') # , labels=dummies_cols
        
        df = pd.concat([df, pd.get_dummies(df[c], prefix=c)], axis=1)
        
    df.columns = [c.replace(',', '_') for c in df.columns]
    df.columns = [c.replace('(', '_') for c in df.columns]
    df.columns = [c.replace(']', '_') for c in df.columns]
    df.columns = [c.replace('.', '_') for c in df.columns]
           
        
    df.drop(columns, axis=1, inplace=True)
        
    return df

def encode_cat(df):
    
    columns = df.columns
    for c in columns:
        
        le = preprocessing.LabelEncoder()
        le.fit(df[c].unique())
        df[c] = le.transform(df[c])
        
    return df
    
    

corr_threshold = 0
dummies_quantiles = 4
variance_threshold = 0.1
unique_values_threshold = 20
pca_components = 0.999


In [3]:
df_train = pd.read_csv("./dataset/train.csv")

df_train_Id = df_train[['Id']]

# Outliers
df_train.drop(df_train[(df_train['YearBuilt'] < 1900) & (df_train['SalePrice'] > 250000)].index, inplace=True)
df_train.drop(df_train[(df_train['YearBuilt'] > 1980) & (df_train['SalePrice'] > 700000)].index, inplace=True)
df_train.drop(df_train[(df_train['GrLivArea'] > 4000)].index, inplace=True)

df_train_SalePrice = df_train[['SalePrice']]

df_train.drop(['Id'], axis=1, inplace=True)

df_test = pd.read_csv("./dataset/test.csv")
df_test_Id = df_test[['Id']]
df_test.drop(['Id'], axis = 1, inplace=True)

# Concat train and test to process all datas
df_train = pd.concat([df_train, df_test])

## Fill nan
df_train_num =  df_train.select_dtypes(include='number')
df_train_num.fillna(df_train_num.median(), inplace=True)


df_train_cat = df_train.select_dtypes(include='object')
df_train_cat.fillna(df_train_cat.mode().iloc[0], inplace=True)

df_train = pd.concat([df_train_num, df_train_cat], axis=1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [4]:
df_train = df_train[df_corr_columns(df_train, 'SalePrice', corr_threshold)]

df_train_cat_cols, df_train_scaled_cols = separe_per_unique_threshold(df_train, unique_values_threshold)
# transfert à la main de certaines variables pour discretisation


df_train_dummies_cols = df_train_scaled_cols[
    (df_train_scaled_cols['variable'] == 'YearBuilt') |
    (df_train_scaled_cols['variable'] == 'YearRemodAdd') |
    (df_train_scaled_cols['variable'] == 'GarageYrBlt')]

df_train_scaled_cols = df_train_scaled_cols[
    (df_train_scaled_cols['variable'] != 'YearBuilt') &
    (df_train_scaled_cols['variable'] != 'YearRemodAdd') &
    (df_train_scaled_cols['variable'] != 'GarageYrBlt')]

df_train_dummies_cols

Unnamed: 0,variable,count
6,YearRemodAdd,61
24,GarageYrBlt,103
5,YearBuilt,117


In [5]:
df_train_scaled_cols

Unnamed: 0,variable,count
30,3SsnPorch,31
14,LowQualFinSF,35
33,MiscVal,38
31,ScreenPorch,120
1,LotFrontage,128
29,EnclosedPorch,182
28,OpenPorchSF,248
9,BsmtFinSF2,272
27,WoodDeckSF,378
7,MasVnrArea,441


In [6]:
df_train_cat_cols

Unnamed: 0,variable,count
17,BsmtHalfBath,3
19,HalfBath,3
16,BsmtFullBath,4
21,KitchenAbvGr,4
18,FullBath,5
23,Fireplaces,5
35,YrSold,5
25,GarageCars,6
20,BedroomAbvGr,8
4,OverallCond,9


In [7]:
df_train[df_train_cat_cols['variable'].tolist()].head()

Unnamed: 0,BsmtHalfBath,HalfBath,BsmtFullBath,KitchenAbvGr,FullBath,Fireplaces,YrSold,GarageCars,BedroomAbvGr,OverallCond,OverallQual,PoolArea,MoSold,TotRmsAbvGrd,MSSubClass
0,0.0,1,1.0,1,2,0,2008,2.0,3,5,7,0,2,8,60
1,1.0,0,0.0,1,2,1,2007,2.0,3,8,6,0,5,6,20
2,0.0,1,1.0,1,2,1,2008,2.0,3,5,7,0,9,6,60
3,0.0,0,1.0,1,1,1,2006,3.0,3,5,7,0,2,7,70
4,0.0,1,1.0,1,2,1,2008,3.0,4,5,8,0,12,9,60


In [8]:
df_train_cat = encode_cat(df_train[df_train_cat_cols['variable'].tolist()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = le.transform(df[c])


In [9]:
df_train_cat.head()

Unnamed: 0,BsmtHalfBath,HalfBath,BsmtFullBath,KitchenAbvGr,FullBath,Fireplaces,YrSold,GarageCars,BedroomAbvGr,OverallCond,OverallQual,PoolArea,MoSold,TotRmsAbvGrd,MSSubClass
0,0,1,1,1,2,0,2,2,3,4,6,0,1,6,5
1,1,0,0,1,2,1,1,2,3,7,5,0,4,4,0
2,0,1,1,1,2,1,2,2,3,4,6,0,8,4,5
3,0,0,1,1,1,1,0,3,3,4,6,0,1,5,6
4,0,1,1,1,2,1,2,3,4,4,7,0,11,7,5


In [10]:
df_train_dummies = dummies(df_train[df_train_dummies_cols['variable'].tolist()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.qcut(df[c], q=quantiles, duplicates='drop') # , labels=dummies_cols


In [11]:
df_train_dummies.head()

Unnamed: 0,YearRemodAdd__1949_999_ 1965_0_,YearRemodAdd__1965_0_ 1993_0_,YearRemodAdd__1993_0_ 2004_0_,YearRemodAdd__2004_0_ 2010_0_,GarageYrBlt__1894_999_ 1961_0_,GarageYrBlt__1961_0_ 1979_0_,GarageYrBlt__1979_0_ 2001_0_,GarageYrBlt__2001_0_ 2207_0_,YearBuilt__1871_999_ 1954_0_,YearBuilt__1954_0_ 1973_0_,YearBuilt__1973_0_ 2001_0_,YearBuilt__2001_0_ 2010_0_
0,0,0,1,0,0,0,0,1,0,0,0,1
1,0,1,0,0,0,1,0,0,0,0,1,0
2,0,0,1,0,0,0,1,0,0,0,1,0
3,0,1,0,0,0,0,1,0,1,0,0,0
4,0,0,1,0,0,0,1,0,0,0,1,0


In [12]:
# Scaled
df_train_scaled = df_train[df_train_scaled_cols['variable'].tolist()]

scaler = preprocessing.StandardScaler().fit(df_train_scaled)
df_train_scaled[df_train_scaled.columns] = pd.DataFrame(scaler.transform(df_train_scaled), index=df_train_scaled.index)

df_train_scaled.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,3SsnPorch,LowQualFinSF,MiscVal,ScreenPorch,LotFrontage,EnclosedPorch,OpenPorchSF,BsmtFinSF2,WoodDeckSF,MasVnrArea,GarageArea,2ndFlrSF,BsmtFinSF1,TotalBsmtSF,1stFlrSF,BsmtUnfSF,GrLivArea,LotArea
0,-0.103474,-0.099696,-0.089716,-0.286275,-0.189292,-0.360063,0.2094,-0.293462,-0.738938,0.538113,0.356719,1.227949,0.60381,-0.44898,-0.782325,-0.932996,0.443045,-0.214575
1,-0.103474,-0.099696,-0.089716,-0.286275,0.533844,-0.360063,-0.70769,-0.293462,1.617934,-0.56904,-0.054343,-0.785512,1.218183,0.500077,0.27523,-0.627798,-0.475947,-0.06723
2,-0.103474,-0.099696,-0.089716,-0.286275,-0.044665,-0.360063,-0.076251,-0.293462,-0.738938,0.346056,0.636989,1.256241,0.106891,-0.299375,-0.615616,-0.286158,0.598945,0.144178
3,-0.103474,-0.099696,-0.089716,-0.286275,-0.430338,3.892679,-0.181491,-0.293462,-0.738938,-0.56904,0.795808,0.996896,-0.502964,-0.682738,-0.508819,-0.044732,0.457404,-0.073636
4,-0.103474,-0.099696,-0.089716,-0.286275,0.72668,-0.360063,0.555188,-0.293462,0.779584,1.408018,1.702013,1.697128,0.488616,0.226581,-0.029533,-0.158612,1.444089,0.529839


In [13]:
df_train = pd.concat([df_train_cat, df_train_scaled, df_train_dummies], axis=1)
df_train.shape

(2911, 45)

In [14]:
df_test = df_train.tail(1459) 
df_train = df_train.head(df_train_SalePrice.shape[0]) # shape minus outliers 

print(df_train.shape, df_test.shape)

(1452, 45) (1459, 45)


In [15]:
target = df_train_SalePrice
data = df_train

In [16]:
# centrer la distribution de la cible
target = np.log1p(target['SalePrice'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, random_state = 66) 

In [18]:
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
print(xgboost.__version__)

1.5.2


In [20]:
# create an xgboost regression model
#model_xgbr = XGBRegressor()

# Instantiation
'''
xgb_r = XGBRegressor(objective ='reg:squarederror', learning_rate=0.3, subsample=0.8, colsample_bytree=1, gamma=0.1,
                     max_depth=4, 
                  n_estimators = 830, seed = 123)
'''

xgb_r = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

  
# Fitting the model
eval_set = [(X_test, y_test)]
xgb_r.fit(X_train, y_train, eval_metric="rmse", eval_set=eval_set, verbose=True)
  
# Predict the model
pred = xgb_r.predict(X_test)
  
# RMSE Computation
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE : % f" %(rmse))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:10.95059
[1]	validation_0-rmse:10.40475
[2]	validation_0-rmse:9.88477
[3]	validation_0-rmse:9.39259
[4]	validation_0-rmse:8.92543
[5]	validation_0-rmse:8.48059
[6]	validation_0-rmse:8.05752
[7]	validation_0-rmse:7.65574
[8]	validation_0-rmse:7.27372
[9]	validation_0-rmse:6.91169
[10]	validation_0-rmse:6.56744
[11]	validation_0-rmse:6.24151
[12]	validation_0-rmse:5.93016
[13]	validation_0-rmse:5.63530
[14]	validation_0-rmse:5.35551
[15]	validation_0-rmse:5.08840
[16]	validation_0-rmse:4.83561
[17]	validation_0-rmse:4.59547
[18]	validation_0-rmse:4.36737
[19]	validation_0-rmse:4.15131
[20]	validation_0-rmse:3.94595
[21]	validation_0-rmse:3.75041
[22]	validati

[247]	validation_0-rmse:0.13970
[248]	validation_0-rmse:0.13970
[249]	validation_0-rmse:0.13970
[250]	validation_0-rmse:0.13968
[251]	validation_0-rmse:0.13959
[252]	validation_0-rmse:0.13952
[253]	validation_0-rmse:0.13952
[254]	validation_0-rmse:0.13950
[255]	validation_0-rmse:0.13952
[256]	validation_0-rmse:0.13954
[257]	validation_0-rmse:0.13953
[258]	validation_0-rmse:0.13953
[259]	validation_0-rmse:0.13953
[260]	validation_0-rmse:0.13957
[261]	validation_0-rmse:0.13957
[262]	validation_0-rmse:0.13957
[263]	validation_0-rmse:0.13955
[264]	validation_0-rmse:0.13947
[265]	validation_0-rmse:0.13947
[266]	validation_0-rmse:0.13944
[267]	validation_0-rmse:0.13944
[268]	validation_0-rmse:0.13944
[269]	validation_0-rmse:0.13943
[270]	validation_0-rmse:0.13951
[271]	validation_0-rmse:0.13951
[272]	validation_0-rmse:0.13948
[273]	validation_0-rmse:0.13948
[274]	validation_0-rmse:0.13948
[275]	validation_0-rmse:0.13937
[276]	validation_0-rmse:0.13932
[277]	validation_0-rmse:0.13932
[278]	va

[504]	validation_0-rmse:0.13703
[505]	validation_0-rmse:0.13703
[506]	validation_0-rmse:0.13703
[507]	validation_0-rmse:0.13703
[508]	validation_0-rmse:0.13697
[509]	validation_0-rmse:0.13696
[510]	validation_0-rmse:0.13696
[511]	validation_0-rmse:0.13696
[512]	validation_0-rmse:0.13690
[513]	validation_0-rmse:0.13692
[514]	validation_0-rmse:0.13693
[515]	validation_0-rmse:0.13693
[516]	validation_0-rmse:0.13693
[517]	validation_0-rmse:0.13693
[518]	validation_0-rmse:0.13693
[519]	validation_0-rmse:0.13693
[520]	validation_0-rmse:0.13689
[521]	validation_0-rmse:0.13689
[522]	validation_0-rmse:0.13681
[523]	validation_0-rmse:0.13681
[524]	validation_0-rmse:0.13681
[525]	validation_0-rmse:0.13681
[526]	validation_0-rmse:0.13684
[527]	validation_0-rmse:0.13685
[528]	validation_0-rmse:0.13685
[529]	validation_0-rmse:0.13685
[530]	validation_0-rmse:0.13675
[531]	validation_0-rmse:0.13675
[532]	validation_0-rmse:0.13675
[533]	validation_0-rmse:0.13675
[534]	validation_0-rmse:0.13675
[535]	va

[761]	validation_0-rmse:0.13605
[762]	validation_0-rmse:0.13605
[763]	validation_0-rmse:0.13605
[764]	validation_0-rmse:0.13605
[765]	validation_0-rmse:0.13605
[766]	validation_0-rmse:0.13604
[767]	validation_0-rmse:0.13604
[768]	validation_0-rmse:0.13604
[769]	validation_0-rmse:0.13604
[770]	validation_0-rmse:0.13604
[771]	validation_0-rmse:0.13594
[772]	validation_0-rmse:0.13594
[773]	validation_0-rmse:0.13594
[774]	validation_0-rmse:0.13594
[775]	validation_0-rmse:0.13594
[776]	validation_0-rmse:0.13594
[777]	validation_0-rmse:0.13594
[778]	validation_0-rmse:0.13594
[779]	validation_0-rmse:0.13594
[780]	validation_0-rmse:0.13594
[781]	validation_0-rmse:0.13597
[782]	validation_0-rmse:0.13597
[783]	validation_0-rmse:0.13597
[784]	validation_0-rmse:0.13597
[785]	validation_0-rmse:0.13596
[786]	validation_0-rmse:0.13596
[787]	validation_0-rmse:0.13596
[788]	validation_0-rmse:0.13596
[789]	validation_0-rmse:0.13596
[790]	validation_0-rmse:0.13590
[791]	validation_0-rmse:0.13591
[792]	va

[1017]	validation_0-rmse:0.13549
[1018]	validation_0-rmse:0.13549
[1019]	validation_0-rmse:0.13549
[1020]	validation_0-rmse:0.13552
[1021]	validation_0-rmse:0.13552
[1022]	validation_0-rmse:0.13551
[1023]	validation_0-rmse:0.13551
[1024]	validation_0-rmse:0.13551
[1025]	validation_0-rmse:0.13551
[1026]	validation_0-rmse:0.13551
[1027]	validation_0-rmse:0.13551
[1028]	validation_0-rmse:0.13551
[1029]	validation_0-rmse:0.13554
[1030]	validation_0-rmse:0.13557
[1031]	validation_0-rmse:0.13557
[1032]	validation_0-rmse:0.13557
[1033]	validation_0-rmse:0.13557
[1034]	validation_0-rmse:0.13557
[1035]	validation_0-rmse:0.13557
[1036]	validation_0-rmse:0.13557
[1037]	validation_0-rmse:0.13557
[1038]	validation_0-rmse:0.13557
[1039]	validation_0-rmse:0.13557
[1040]	validation_0-rmse:0.13557
[1041]	validation_0-rmse:0.13557
[1042]	validation_0-rmse:0.13558
[1043]	validation_0-rmse:0.13560
[1044]	validation_0-rmse:0.13560
[1045]	validation_0-rmse:0.13560
[1046]	validation_0-rmse:0.13555
[1047]	val

[1266]	validation_0-rmse:0.13627
[1267]	validation_0-rmse:0.13627
[1268]	validation_0-rmse:0.13627
[1269]	validation_0-rmse:0.13627
[1270]	validation_0-rmse:0.13627
[1271]	validation_0-rmse:0.13627
[1272]	validation_0-rmse:0.13627
[1273]	validation_0-rmse:0.13627
[1274]	validation_0-rmse:0.13627
[1275]	validation_0-rmse:0.13627
[1276]	validation_0-rmse:0.13627
[1277]	validation_0-rmse:0.13627
[1278]	validation_0-rmse:0.13627
[1279]	validation_0-rmse:0.13627
[1280]	validation_0-rmse:0.13627
[1281]	validation_0-rmse:0.13627
[1282]	validation_0-rmse:0.13626
[1283]	validation_0-rmse:0.13626
[1284]	validation_0-rmse:0.13626
[1285]	validation_0-rmse:0.13626
[1286]	validation_0-rmse:0.13626
[1287]	validation_0-rmse:0.13626
[1288]	validation_0-rmse:0.13626
[1289]	validation_0-rmse:0.13626
[1290]	validation_0-rmse:0.13626
[1291]	validation_0-rmse:0.13630
[1292]	validation_0-rmse:0.13630
[1293]	validation_0-rmse:0.13637
[1294]	validation_0-rmse:0.13637
[1295]	validation_0-rmse:0.13632
[1296]	val

[1515]	validation_0-rmse:0.13625
[1516]	validation_0-rmse:0.13625
[1517]	validation_0-rmse:0.13625
[1518]	validation_0-rmse:0.13625
[1519]	validation_0-rmse:0.13625
[1520]	validation_0-rmse:0.13625
[1521]	validation_0-rmse:0.13626
[1522]	validation_0-rmse:0.13626
[1523]	validation_0-rmse:0.13626
[1524]	validation_0-rmse:0.13626
[1525]	validation_0-rmse:0.13625
[1526]	validation_0-rmse:0.13625
[1527]	validation_0-rmse:0.13625
[1528]	validation_0-rmse:0.13625
[1529]	validation_0-rmse:0.13625
[1530]	validation_0-rmse:0.13625
[1531]	validation_0-rmse:0.13625
[1532]	validation_0-rmse:0.13625
[1533]	validation_0-rmse:0.13625
[1534]	validation_0-rmse:0.13625
[1535]	validation_0-rmse:0.13625
[1536]	validation_0-rmse:0.13625
[1537]	validation_0-rmse:0.13625
[1538]	validation_0-rmse:0.13625
[1539]	validation_0-rmse:0.13625
[1540]	validation_0-rmse:0.13625
[1541]	validation_0-rmse:0.13625
[1542]	validation_0-rmse:0.13625
[1543]	validation_0-rmse:0.13626
[1544]	validation_0-rmse:0.13626
[1545]	val

[1764]	validation_0-rmse:0.13593
[1765]	validation_0-rmse:0.13593
[1766]	validation_0-rmse:0.13593
[1767]	validation_0-rmse:0.13592
[1768]	validation_0-rmse:0.13592
[1769]	validation_0-rmse:0.13592
[1770]	validation_0-rmse:0.13592
[1771]	validation_0-rmse:0.13590
[1772]	validation_0-rmse:0.13589
[1773]	validation_0-rmse:0.13589
[1774]	validation_0-rmse:0.13589
[1775]	validation_0-rmse:0.13589
[1776]	validation_0-rmse:0.13589
[1777]	validation_0-rmse:0.13589
[1778]	validation_0-rmse:0.13589
[1779]	validation_0-rmse:0.13589
[1780]	validation_0-rmse:0.13595
[1781]	validation_0-rmse:0.13587
[1782]	validation_0-rmse:0.13587
[1783]	validation_0-rmse:0.13584
[1784]	validation_0-rmse:0.13584
[1785]	validation_0-rmse:0.13584
[1786]	validation_0-rmse:0.13584
[1787]	validation_0-rmse:0.13587
[1788]	validation_0-rmse:0.13587
[1789]	validation_0-rmse:0.13587
[1790]	validation_0-rmse:0.13587
[1791]	validation_0-rmse:0.13587
[1792]	validation_0-rmse:0.13587
[1793]	validation_0-rmse:0.13587
[1794]	val

[2013]	validation_0-rmse:0.13572
[2014]	validation_0-rmse:0.13572
[2015]	validation_0-rmse:0.13572
[2016]	validation_0-rmse:0.13572
[2017]	validation_0-rmse:0.13572
[2018]	validation_0-rmse:0.13572
[2019]	validation_0-rmse:0.13572
[2020]	validation_0-rmse:0.13572
[2021]	validation_0-rmse:0.13572
[2022]	validation_0-rmse:0.13572
[2023]	validation_0-rmse:0.13566
[2024]	validation_0-rmse:0.13566
[2025]	validation_0-rmse:0.13566
[2026]	validation_0-rmse:0.13566
[2027]	validation_0-rmse:0.13566
[2028]	validation_0-rmse:0.13570
[2029]	validation_0-rmse:0.13570
[2030]	validation_0-rmse:0.13570
[2031]	validation_0-rmse:0.13570
[2032]	validation_0-rmse:0.13570
[2033]	validation_0-rmse:0.13570
[2034]	validation_0-rmse:0.13570
[2035]	validation_0-rmse:0.13570
[2036]	validation_0-rmse:0.13570
[2037]	validation_0-rmse:0.13570
[2038]	validation_0-rmse:0.13570
[2039]	validation_0-rmse:0.13570
[2040]	validation_0-rmse:0.13566
[2041]	validation_0-rmse:0.13566
[2042]	validation_0-rmse:0.13566
[2043]	val

In [21]:
print("score train :",xgb_r.score(X_train, y_train))
print("score test :", xgb_r.score(X_test, y_test))

score train : 0.9499279105310362
score test : 0.8842401172571823


In [22]:
#pred = xgb_r.predict(pca.transform(df_test))
pred = xgb_r.predict(df_test)

df_pred = pd.DataFrame(np.expm1(pred), columns=['SalePrice'])
df_pred.head()

Unnamed: 0,SalePrice
0,126902.179688
1,159305.34375
2,182811.21875
3,187381.6875
4,184802.515625


In [23]:
df_pred = df_test_Id.join(df_pred)
df_pred.to_csv('./submission/model_xgb_r.csv', index=False)