In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn import neighbors
from sklearn.decomposition import PCA
from sklearn import linear_model, preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn import model_selection
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def df_corr_columns(df, target, threshold):
    df_corr = df.corr()
    df_corr_SalePrice = df_corr[[target]]
    df_corr_SalePrice = df_corr_SalePrice.reset_index()
    df_corr_SalePrice = df_corr_SalePrice[(df_corr_SalePrice['index'] != target)]
    df_corr_SalePrice_columns = df_corr_SalePrice[(
        (df_corr_SalePrice[target] > threshold) | 
        (df_corr_SalePrice[target] < -threshold))]['index']
    return df_corr_SalePrice_columns

def separe_per_unique_threshold(df, threshold):
    dic_c_unique_len = {}
    for c in df.columns:
        dic_c_unique_len[c] = len(df[c].unique())
        #print(c, len(df_num[c].unique()))
 

    df_c_unique_len = pd.DataFrame.from_dict(dic_c_unique_len, orient="index").reset_index()
    df_c_unique_len.columns=['variable', 'count']
    return df_c_unique_len[df_c_unique_len['count'] <= 20].sort_values('count'), df_c_unique_len[df_c_unique_len['count'] > 20].sort_values('count')

def dummies(df, quantiles = 4):
    
    columns = df.columns
    for c in columns:
        
        df[c] = pd.qcut(df[c], q=quantiles, duplicates='drop') # , labels=dummies_cols
        
        df = pd.concat([df, pd.get_dummies(df[c], prefix=c)], axis=1)
        
    df.columns = [c.replace(',', '_') for c in df.columns]
    df.columns = [c.replace('(', '_') for c in df.columns]
    df.columns = [c.replace(']', '_') for c in df.columns]
    df.columns = [c.replace('.', '_') for c in df.columns]
           
        
    df.drop(columns, axis=1, inplace=True)
        
    return df

def encode_cat(df):
    
    columns = df.columns
    for c in columns:
        
        le = preprocessing.LabelEncoder()
        le.fit(df[c].unique())
        df[c] = le.transform(df[c])
        
    return df
    
    

corr_threshold = 0
dummies_quantiles = 4
variance_threshold = 0.1
unique_values_threshold = 20
pca_components = 0.999


In [3]:
df_train = pd.read_csv("./dataset/train.csv")

df_train_Id = df_train[['Id']]

# Outliers
df_train.drop(df_train[(df_train['YearBuilt'] < 1900) & (df_train['SalePrice'] > 250000)].index, inplace=True)
df_train.drop(df_train[(df_train['YearBuilt'] > 1980) & (df_train['SalePrice'] > 700000)].index, inplace=True)
df_train.drop(df_train[(df_train['GrLivArea'] > 4000)].index, inplace=True)

df_train_SalePrice = df_train[['SalePrice']]

df_train.drop(['Id'], axis=1, inplace=True)

df_test = pd.read_csv("./dataset/test.csv")
df_test_Id = df_test[['Id']]
df_test.drop(['Id'], axis = 1, inplace=True)

# Concat train and test to process all datas
df_train = pd.concat([df_train, df_test])

## Fill nan
df_train_num =  df_train.select_dtypes(include='number')
df_train_num.fillna(df_train_num.median(), inplace=True)


df_train_cat = df_train.select_dtypes(include='object')
df_train_cat.fillna(df_train_cat.mode().iloc[0], inplace=True)

df_train = pd.concat([df_train_num, df_train_cat], axis=1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [4]:
df_train = df_train[df_corr_columns(df_train, 'SalePrice', corr_threshold)]

df_train_cat_cols, df_train_scaled_cols = separe_per_unique_threshold(df_train, unique_values_threshold)
# transfert Ã  la main de certaines variables pour discretisation


df_train_dummies_cols = df_train_scaled_cols[
    (df_train_scaled_cols['variable'] == 'YearBuilt') |
    (df_train_scaled_cols['variable'] == 'YearRemodAdd') |
    (df_train_scaled_cols['variable'] == 'GarageYrBlt')]

df_train_scaled_cols = df_train_scaled_cols[
    (df_train_scaled_cols['variable'] != 'YearBuilt') &
    (df_train_scaled_cols['variable'] != 'YearRemodAdd') &
    (df_train_scaled_cols['variable'] != 'GarageYrBlt')]

df_train_dummies_cols

Unnamed: 0,variable,count
6,YearRemodAdd,61
24,GarageYrBlt,103
5,YearBuilt,117


In [6]:
df_train_scaled_cols

Unnamed: 0,variable,count
30,3SsnPorch,31
14,LowQualFinSF,35
33,MiscVal,38
31,ScreenPorch,120
1,LotFrontage,128
29,EnclosedPorch,182
28,OpenPorchSF,248
9,BsmtFinSF2,272
27,WoodDeckSF,378
7,MasVnrArea,441


In [5]:
df_train_cat_cols

Unnamed: 0,variable,count
17,BsmtHalfBath,3
19,HalfBath,3
16,BsmtFullBath,4
21,KitchenAbvGr,4
18,FullBath,5
23,Fireplaces,5
35,YrSold,5
25,GarageCars,6
20,BedroomAbvGr,8
4,OverallCond,9


In [7]:
df_train[df_train_cat_cols['variable'].tolist()].head()

Unnamed: 0,BsmtHalfBath,HalfBath,BsmtFullBath,KitchenAbvGr,FullBath,Fireplaces,YrSold,GarageCars,BedroomAbvGr,OverallCond,OverallQual,PoolArea,MoSold,TotRmsAbvGrd,MSSubClass
0,0.0,1,1.0,1,2,0,2008,2.0,3,5,7,0,2,8,60
1,1.0,0,0.0,1,2,1,2007,2.0,3,8,6,0,5,6,20
2,0.0,1,1.0,1,2,1,2008,2.0,3,5,7,0,9,6,60
3,0.0,0,1.0,1,1,1,2006,3.0,3,5,7,0,2,7,70
4,0.0,1,1.0,1,2,1,2008,3.0,4,5,8,0,12,9,60


In [8]:
df_train_cat = encode_cat(df_train[df_train_cat_cols['variable'].tolist()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = le.transform(df[c])


In [9]:
df_train_cat.head()

Unnamed: 0,BsmtHalfBath,HalfBath,BsmtFullBath,KitchenAbvGr,FullBath,Fireplaces,YrSold,GarageCars,BedroomAbvGr,OverallCond,OverallQual,PoolArea,MoSold,TotRmsAbvGrd,MSSubClass
0,0,1,1,1,2,0,2,2,3,4,6,0,1,6,5
1,1,0,0,1,2,1,1,2,3,7,5,0,4,4,0
2,0,1,1,1,2,1,2,2,3,4,6,0,8,4,5
3,0,0,1,1,1,1,0,3,3,4,6,0,1,5,6
4,0,1,1,1,2,1,2,3,4,4,7,0,11,7,5


In [10]:
df_train_dummies = dummies(df_train[df_train_dummies_cols['variable'].tolist()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.qcut(df[c], q=quantiles, duplicates='drop') # , labels=dummies_cols


In [11]:
df_train_dummies.head()

Unnamed: 0,YearRemodAdd__1949_999_ 1965_0_,YearRemodAdd__1965_0_ 1993_0_,YearRemodAdd__1993_0_ 2004_0_,YearRemodAdd__2004_0_ 2010_0_,GarageYrBlt__1894_999_ 1961_0_,GarageYrBlt__1961_0_ 1979_0_,GarageYrBlt__1979_0_ 2001_0_,GarageYrBlt__2001_0_ 2207_0_,YearBuilt__1871_999_ 1954_0_,YearBuilt__1954_0_ 1973_0_,YearBuilt__1973_0_ 2001_0_,YearBuilt__2001_0_ 2010_0_
0,0,0,1,0,0,0,0,1,0,0,0,1
1,0,1,0,0,0,1,0,0,0,0,1,0
2,0,0,1,0,0,0,1,0,0,0,1,0
3,0,1,0,0,0,0,1,0,1,0,0,0
4,0,0,1,0,0,0,1,0,0,0,1,0


In [12]:
# Scaled
df_train_scaled = df_train[df_train_scaled_cols['variable'].tolist()]

scaler = preprocessing.StandardScaler().fit(df_train_scaled)
df_train_scaled[df_train_scaled.columns] = pd.DataFrame(scaler.transform(df_train_scaled), index=df_train_scaled.index)

df_train_scaled.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,3SsnPorch,LowQualFinSF,MiscVal,ScreenPorch,LotFrontage,EnclosedPorch,OpenPorchSF,BsmtFinSF2,WoodDeckSF,MasVnrArea,GarageArea,2ndFlrSF,BsmtFinSF1,TotalBsmtSF,1stFlrSF,BsmtUnfSF,GrLivArea,LotArea
0,-0.103474,-0.099696,-0.089716,-0.286275,-0.189292,-0.360063,0.2094,-0.293462,-0.738938,0.538113,0.356719,1.227949,0.60381,-0.44898,-0.782325,-0.932996,0.443045,-0.214575
1,-0.103474,-0.099696,-0.089716,-0.286275,0.533844,-0.360063,-0.70769,-0.293462,1.617934,-0.56904,-0.054343,-0.785512,1.218183,0.500077,0.27523,-0.627798,-0.475947,-0.06723
2,-0.103474,-0.099696,-0.089716,-0.286275,-0.044665,-0.360063,-0.076251,-0.293462,-0.738938,0.346056,0.636989,1.256241,0.106891,-0.299375,-0.615616,-0.286158,0.598945,0.144178
3,-0.103474,-0.099696,-0.089716,-0.286275,-0.430338,3.892679,-0.181491,-0.293462,-0.738938,-0.56904,0.795808,0.996896,-0.502964,-0.682738,-0.508819,-0.044732,0.457404,-0.073636
4,-0.103474,-0.099696,-0.089716,-0.286275,0.72668,-0.360063,0.555188,-0.293462,0.779584,1.408018,1.702013,1.697128,0.488616,0.226581,-0.029533,-0.158612,1.444089,0.529839


In [13]:
df_train = pd.concat([df_train_cat, df_train_scaled, df_train_dummies], axis=1)
df_train.shape

(2911, 45)

In [14]:
df_test = df_train.tail(1459) 
df_train = df_train.head(df_train_SalePrice.shape[0]) # shape minus outliers 

print(df_train.shape, df_test.shape)

(1452, 45) (1459, 45)


In [15]:
target = df_train_SalePrice
data = df_train

In [16]:
# centrer la distribution de la cible
target = np.log1p(target['SalePrice'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, random_state = 66) 

In [18]:
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
print(xgboost.__version__)

1.5.2


In [59]:
# create an xgboost regression model
#model_xgbr = XGBRegressor()

# Instantiation
'''
xgb_r = XGBRegressor(objective ='reg:squarederror', learning_rate=0.3, subsample=0.8, colsample_bytree=1, gamma=0.1,
                     max_depth=4, 
                  n_estimators = 830, seed = 123)
'''

xgb_r = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

  
# Fitting the model
eval_set = [(X_test, y_test)]
xgb_r.fit(X_train, y_train, eval_metric="rmse", eval_set=eval_set, verbose=True)
  
# Predict the model
pred = xgb_r.predict(X_test)
  
# RMSE Computation
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE : % f" %(rmse))

[0]	validation_0-rmse:8.07062
[1]	validation_0-rmse:5.65678
[2]	validation_0-rmse:3.96779
[3]	validation_0-rmse:2.78527
[4]	validation_0-rmse:1.96198
[5]	validation_0-rmse:1.38555
[6]	validation_0-rmse:0.98338
[7]	validation_0-rmse:0.70253
[8]	validation_0-rmse:0.51079
[9]	validation_0-rmse:0.37983
[10]	validation_0-rmse:0.29284
[11]	validation_0-rmse:0.23592
[12]	validation_0-rmse:0.20245
[13]	validation_0-rmse:0.18271
[14]	validation_0-rmse:0.16882
[15]	validation_0-rmse:0.16099
[16]	validation_0-rmse:0.15640
[17]	validation_0-rmse:0.15324
[18]	validation_0-rmse:0.15236
[19]	validation_0-rmse:0.15182
[20]	validation_0-rmse:0.15162
[21]	validation_0-rmse:0.15083
[22]	validation_0-rmse:0.14975
[23]	validation_0-rmse:0.14932
[24]	validation_0-rmse:0.14866
[25]	validation_0-rmse:0.14825
[26]	validation_0-rmse:0.14769
[27]	validation_0-rmse:0.14769
[28]	validation_0-rmse:0.14771
[29]	validation_0-rmse:0.14771
[30]	validation_0-rmse:0.14631
[31]	validation_0-rmse:0.14632
[32]	validation_0-

[260]	validation_0-rmse:0.14296
[261]	validation_0-rmse:0.14297
[262]	validation_0-rmse:0.14297
[263]	validation_0-rmse:0.14296
[264]	validation_0-rmse:0.14297
[265]	validation_0-rmse:0.14296
[266]	validation_0-rmse:0.14297
[267]	validation_0-rmse:0.14297
[268]	validation_0-rmse:0.14297
[269]	validation_0-rmse:0.14297
[270]	validation_0-rmse:0.14297
[271]	validation_0-rmse:0.14296
[272]	validation_0-rmse:0.14296
[273]	validation_0-rmse:0.14296
[274]	validation_0-rmse:0.14297
[275]	validation_0-rmse:0.14296
[276]	validation_0-rmse:0.14296
[277]	validation_0-rmse:0.14296
[278]	validation_0-rmse:0.14297
[279]	validation_0-rmse:0.14297
[280]	validation_0-rmse:0.14297
[281]	validation_0-rmse:0.14279
[282]	validation_0-rmse:0.14279
[283]	validation_0-rmse:0.14279
[284]	validation_0-rmse:0.14279
[285]	validation_0-rmse:0.14279
[286]	validation_0-rmse:0.14279
[287]	validation_0-rmse:0.14279
[288]	validation_0-rmse:0.14279
[289]	validation_0-rmse:0.14279
[290]	validation_0-rmse:0.14279
[291]	va

[517]	validation_0-rmse:0.14231
[518]	validation_0-rmse:0.14231
[519]	validation_0-rmse:0.14231
[520]	validation_0-rmse:0.14231
[521]	validation_0-rmse:0.14231
[522]	validation_0-rmse:0.14231
[523]	validation_0-rmse:0.14231
[524]	validation_0-rmse:0.14231
[525]	validation_0-rmse:0.14231
[526]	validation_0-rmse:0.14231
[527]	validation_0-rmse:0.14231
[528]	validation_0-rmse:0.14231
[529]	validation_0-rmse:0.14231
[530]	validation_0-rmse:0.14231
[531]	validation_0-rmse:0.14231
[532]	validation_0-rmse:0.14231
[533]	validation_0-rmse:0.14231
[534]	validation_0-rmse:0.14231
[535]	validation_0-rmse:0.14231
[536]	validation_0-rmse:0.14231
[537]	validation_0-rmse:0.14231
[538]	validation_0-rmse:0.14231
[539]	validation_0-rmse:0.14231
[540]	validation_0-rmse:0.14231
[541]	validation_0-rmse:0.14231
[542]	validation_0-rmse:0.14231
[543]	validation_0-rmse:0.14231
[544]	validation_0-rmse:0.14231
[545]	validation_0-rmse:0.14231
[546]	validation_0-rmse:0.14231
[547]	validation_0-rmse:0.14231
[548]	va

[774]	validation_0-rmse:0.14231
[775]	validation_0-rmse:0.14231
[776]	validation_0-rmse:0.14231
[777]	validation_0-rmse:0.14231
[778]	validation_0-rmse:0.14231
[779]	validation_0-rmse:0.14231
[780]	validation_0-rmse:0.14231
[781]	validation_0-rmse:0.14231
[782]	validation_0-rmse:0.14231
[783]	validation_0-rmse:0.14231
[784]	validation_0-rmse:0.14231
[785]	validation_0-rmse:0.14231
[786]	validation_0-rmse:0.14231
[787]	validation_0-rmse:0.14231
[788]	validation_0-rmse:0.14231
[789]	validation_0-rmse:0.14232
[790]	validation_0-rmse:0.14231
[791]	validation_0-rmse:0.14231
[792]	validation_0-rmse:0.14231
[793]	validation_0-rmse:0.14231
[794]	validation_0-rmse:0.14231
[795]	validation_0-rmse:0.14231
[796]	validation_0-rmse:0.14231
[797]	validation_0-rmse:0.14231
[798]	validation_0-rmse:0.14231
[799]	validation_0-rmse:0.14231
[800]	validation_0-rmse:0.14231
[801]	validation_0-rmse:0.14231
[802]	validation_0-rmse:0.14231
[803]	validation_0-rmse:0.14231
[804]	validation_0-rmse:0.14231
[805]	va

In [60]:
print("score train :",xgb_r.score(X_train, y_train))
print("score test :", xgb_r.score(X_test, y_test))

score train : 0.9527804095581764
score test : 0.8730240817220112


In [48]:
#pred = xgb_r.predict(pca.transform(df_test))
pred = xgb_r.predict(df_test)

df_pred = pd.DataFrame(np.expm1(pred), columns=['SalePrice'])
df_pred.head()

Unnamed: 0,SalePrice
0,135217.6875
1,159440.15625
2,171187.234375
3,181970.375
4,204164.359375


In [37]:
df_pred = df_test_Id.join(df_pred)
df_pred.to_csv('./submission/model_xgb_r.csv', index=False)