In [5]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
all_data = pd.read_csv('train.csv.zip')
all_data.head(2)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


In [7]:
#Removing duplicate or constant columns as per https://www.kaggle.com/yohanb/categorical-features-encoding-xgb-0-554
columns_to_remove = ['X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347', 
                     'X382', 'X232', 'X279', 'X35', 'X37', 'X39', 'X302', 'X113', 'X134', 'X147', 'X222', 
                     'X102', 'X214', 'X239', 'X76', 'X324', 'X248', 'X253', 'X385', 'X172', 'X216', 'X213', 
                     'X84', 'X244', 'X122', 'X243', 'X320', 'X245', 'X94', 'X242', 'X199', 'X119', 'X227', 
                     'X146', 'X226', 'X326', 'X360', 'X262', 'X266', 'X247', 'X254', 'X364', 'X365', 'X296', 'X299',
                     'X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
new_columns = [col for col in all_data.columns if col not in columns_to_remove]
data1 = all_data[new_columns]

In [8]:
data1.shape

(4209, 321)

## Baseline linear regression without categorical features

In [9]:
X = data1.iloc[:,10:].values
y = data1.iloc[:,1].values

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

In [11]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error
model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))


Train R^2:  0.5843533315823948
Test R^2:  -6.160078533586281e+22
Test MSE:  9.528489855981273e+24


## Label encoding of all columns


In [12]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
test_data = pd.read_csv('test.csv.zip')
combined = pd.concat([all_data, test_data], axis=0, sort=False)

cat_column_names = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

label_encoders = {}

for col in cat_column_names:
    label_encoder = LabelEncoder()
    label_encoder.fit(combined[col])
    label_encoders[col] = label_encoder


## Mean encoding of columns
Now we will do a naive mean encoding of the categorical columns X0-X8

In [13]:
data2 = data1.copy()

In [14]:
import sys,os, pathlib
current = pathlib.Path(os.getcwd())
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')
sys.path.append(str(catenc))

In [15]:
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder

cat_column_names = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

for col in cat_column_names:
    data2[col] = label_encoders[col].transform(data2[col])


X = data2.iloc[:,2:].values
y = data2.iloc[:,1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

mean_enc_columns = [data2.columns.get_loc(c) for c in data2.columns if c in cat_column_names]


m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)

scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha = 20) #Best alpha we could get via hyperparameter tuning
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5753189141790043
Test R^2:  0.5875966381290332
Test MSE:  63.79109014170886


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)

scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha=55)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5647513986247792
Test R^2:  0.5838897210695404
Test MSE:  64.36448090946871


The result is actually worse. Is it because we ignored the standard deviation?

We will try to achieve the best result by using random forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)


#m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder = TargetEncoder(cols=mean_enc_columns, smoothing=1E-2)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)


model = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict(X_test)

print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.6028315218920661
Test R^2:  0.6168638369298607
Test MSE:  59.264001641680935


In [18]:
#generate submission
kaggle1 = test_data[new_columns[2:]].copy()

for col in cat_column_names:
    kaggle1[col] = label_encoders[col].transform(kaggle1[col])
    
X_kaggle = kaggle1.values

In [19]:
X_kaggle_tran = m_encoder.transform(X_kaggle)
preds_kaggle = model.predict(X_kaggle_tran)
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('te_submission.csv', index=False)

This is much better than Ridge regression, but without limiting the max depth it overfits tremendously

In [20]:
#%autoreload 2
from category_encoders.posterior_imputation import PosteriorImputationEncoder

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)


m_encoder = PosteriorImputationEncoder(cols=mean_enc_columns, n_draws=25, prior_samples_ratio=0.01, random_state=2834)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)
y_train = m_encoder.expand_y(y_train)

#print(X_train.isnull().mean())

model = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict(X_test)
preds = m_encoder.average_y(preds)

print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5902388643573367
Test R^2:  0.6133959090884106
Test MSE:  59.800425245347164


I do not see much difference. We need another example where we can prove better effectiveness of this algorithm.

In [21]:
X_kaggle_tran = m_encoder.transform(X_kaggle)
preds_kaggle = model.predict(X_kaggle_tran)
preds_kaggle = m_encoder.average_y(preds_kaggle)
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('pm_submission.csv', index=False)

## Cross validation

We will do cross-validation to ensure validity of comparison between the two approaches

In [22]:
mean_enc_column_names = [c for c in data2.columns if c in cat_column_names]

### Leave one out encoding

In [31]:
%%time
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score
from category_encoders.leave_one_out import LeaveOneOutEncoder
import optuna
from optuna.distributions import *


loo = LeaveOneOutEncoder(cols=mean_enc_column_names,  random_state=2834)
rf = RandomForestRegressor(n_estimators=400, random_state=2834, n_jobs=-1) 
pipe = Pipeline(steps=[('loo',loo), ('rf',rf)])


param_distribution = {
    'loo__sigma': LogUniformDistribution(1E-5, 1E-1),
    'rf__max_depth': IntUniformDistribution(2,40),
    'rf__max_features' : IntUniformDistribution(1,X_test.shape[1]),
    'rf__min_samples_leaf': IntUniformDistribution(1,15)
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)
X_train = pd.DataFrame(X_train, columns=data1.columns[2:])
X_test = pd.DataFrame(X_test, columns=data1.columns[2:])

search = optuna.integration.OptunaSearchCV(pipe, param_distribution, 
                            cv=5, n_jobs=-1, random_state=514, n_trials=None, timeout=20*60, scoring='r2')
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
test_predict = search.best_estimator_.predict(X_test)
print('Test R^2: ', r2_score(y_test, test_predict))

[I 2020-05-16 14:01:37,521] Finished trial#6 with value: 0.5511313950252014 with parameters: {'loo__sigma': 0.002528886859725847, 'rf__max_depth': 2, 'rf__max_features': 164, 'rf__min_samples_leaf': 1}. Best is trial#6 with value: 0.5511313950252014.
[I 2020-05-16 14:01:43,008] Finished trial#10 with value: 0.5013071666467804 with parameters: {'loo__sigma': 4.90590123230136e-05, 'rf__max_depth': 8, 'rf__max_features': 11, 'rf__min_samples_leaf': 14}. Best is trial#6 with value: 0.5511313950252014.
[I 2020-05-16 14:01:44,936] Finished trial#4 with value: 0.5242331877383426 with parameters: {'loo__sigma': 0.0006641296922897638, 'rf__max_depth': 39, 'rf__max_features': 8, 'rf__min_samples_leaf': 6}. Best is trial#6 with value: 0.5511313950252014.
[I 2020-05-16 14:01:48,822] Finished trial#3 with value: 0.5561280747518056 with parameters: {'loo__sigma': 0.0020542205029519564, 'rf__max_depth': 34, 'rf__max_features': 33, 'rf__min_samples_leaf': 15}. Best is trial#3 with value: 0.55612807475

[I 2020-05-16 14:04:21,079] Finished trial#32 with value: 0.5610036901684856 with parameters: {'loo__sigma': 0.04528246865220672, 'rf__max_depth': 6, 'rf__max_features': 78, 'rf__min_samples_leaf': 8}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:04:30,040] Finished trial#33 with value: 0.5628098179066445 with parameters: {'loo__sigma': 0.08746514053802604, 'rf__max_depth': 6, 'rf__max_features': 127, 'rf__min_samples_leaf': 8}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:04:33,131] Finished trial#34 with value: 0.5617027649214765 with parameters: {'loo__sigma': 0.028902555581829175, 'rf__max_depth': 6, 'rf__max_features': 193, 'rf__min_samples_leaf': 8}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:04:44,965] Finished trial#35 with value: 0.5631992502191538 with parameters: {'loo__sigma': 0.02094576505466469, 'rf__max_depth': 5, 'rf__max_features': 203, 'rf__min_samples_leaf': 8}. Best is trial#28 with value: 0.5648196772

[I 2020-05-16 14:07:33,992] Finished trial#64 with value: 0.5625681781445089 with parameters: {'loo__sigma': 0.050501063869844376, 'rf__max_depth': 3, 'rf__max_features': 249, 'rf__min_samples_leaf': 7}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:07:35,832] Finished trial#65 with value: 0.5628693228700768 with parameters: {'loo__sigma': 0.00388392174696119, 'rf__max_depth': 3, 'rf__max_features': 150, 'rf__min_samples_leaf': 7}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:07:41,751] Finished trial#66 with value: 0.22079600711054378 with parameters: {'loo__sigma': 1.0540444097081543e-05, 'rf__max_depth': 3, 'rf__max_features': 245, 'rf__min_samples_leaf': 7}. Best is trial#28 with value: 0.5648196772881638.
[I 2020-05-16 14:07:44,627] Finished trial#67 with value: 0.556544481287643 with parameters: {'loo__sigma': 0.004029917230939615, 'rf__max_depth': 2, 'rf__max_features': 247, 'rf__min_samples_leaf': 7}. Best is trial#28 with value: 0.56481

[I 2020-05-16 14:10:17,778] Finished trial#97 with value: 0.5624133671600101 with parameters: {'loo__sigma': 0.0015023039236827, 'rf__max_depth': 8, 'rf__max_features': 110, 'rf__min_samples_leaf': 4}. Best is trial#80 with value: 0.5659349307894198.
[I 2020-05-16 14:10:18,090] Finished trial#96 with value: 0.5632266735603364 with parameters: {'loo__sigma': 0.002645465288377011, 'rf__max_depth': 8, 'rf__max_features': 112, 'rf__min_samples_leaf': 4}. Best is trial#80 with value: 0.5659349307894198.
[I 2020-05-16 14:10:19,035] Finished trial#98 with value: 0.5624178586055498 with parameters: {'loo__sigma': 0.0029615968545737364, 'rf__max_depth': 8, 'rf__max_features': 113, 'rf__min_samples_leaf': 4}. Best is trial#80 with value: 0.5659349307894198.
[I 2020-05-16 14:10:19,555] Finished trial#99 with value: 0.5624669949982322 with parameters: {'loo__sigma': 0.0017271458836475848, 'rf__max_depth': 8, 'rf__max_features': 118, 'rf__min_samples_leaf': 4}. Best is trial#80 with value: 0.565934

[I 2020-05-16 14:13:00,365] Finished trial#125 with value: 0.5589751712630475 with parameters: {'loo__sigma': 0.08060012552371003, 'rf__max_depth': 12, 'rf__max_features': 141, 'rf__min_samples_leaf': 3}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:13:02,061] Finished trial#130 with value: 0.5646845069518449 with parameters: {'loo__sigma': 0.03196938318740227, 'rf__max_depth': 6, 'rf__max_features': 142, 'rf__min_samples_leaf': 3}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:13:07,476] Finished trial#128 with value: 0.5586307566675242 with parameters: {'loo__sigma': 0.09809709964304149, 'rf__max_depth': 12, 'rf__max_features': 141, 'rf__min_samples_leaf': 3}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:13:12,736] Finished trial#131 with value: 0.565503481952297 with parameters: {'loo__sigma': 0.0969805931563049, 'rf__max_depth': 6, 'rf__max_features': 139, 'rf__min_samples_leaf': 5}. Best is trial#113 with value: 0.56

[I 2020-05-16 14:15:44,642] Finished trial#155 with value: 0.5493501564603454 with parameters: {'loo__sigma': 0.07813783671839772, 'rf__max_depth': 35, 'rf__max_features': 163, 'rf__min_samples_leaf': 1}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:15:52,036] Finished trial#162 with value: 0.5635796762497933 with parameters: {'loo__sigma': 0.07527445273336349, 'rf__max_depth': 3, 'rf__max_features': 164, 'rf__min_samples_leaf': 1}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:15:53,291] Finished trial#159 with value: 0.5510052213310409 with parameters: {'loo__sigma': 0.07773778399626277, 'rf__max_depth': 19, 'rf__max_features': 163, 'rf__min_samples_leaf': 1}. Best is trial#113 with value: 0.5664329341963392.
[I 2020-05-16 14:15:55,513] Finished trial#163 with value: 0.5636307737233277 with parameters: {'loo__sigma': 0.07802634213466741, 'rf__max_depth': 3, 'rf__max_features': 164, 'rf__min_samples_leaf': 1}. Best is trial#113 with value: 0.

[I 2020-05-16 14:18:23,921] Finished trial#192 with value: 0.5643063394371095 with parameters: {'loo__sigma': 0.036480247335157874, 'rf__max_depth': 4, 'rf__max_features': 172, 'rf__min_samples_leaf': 4}. Best is trial#179 with value: 0.5667095843750911.
[I 2020-05-16 14:18:34,135] Finished trial#193 with value: 0.5663341757528058 with parameters: {'loo__sigma': 0.06270361674828998, 'rf__max_depth': 4, 'rf__max_features': 173, 'rf__min_samples_leaf': 4}. Best is trial#179 with value: 0.5667095843750911.
[I 2020-05-16 14:18:37,186] Finished trial#194 with value: 0.5644439718949169 with parameters: {'loo__sigma': 0.035737595735244286, 'rf__max_depth': 4, 'rf__max_features': 173, 'rf__min_samples_leaf': 4}. Best is trial#179 with value: 0.5667095843750911.
[I 2020-05-16 14:18:41,737] Finished trial#195 with value: 0.566033532540986 with parameters: {'loo__sigma': 0.06412909881086896, 'rf__max_depth': 4, 'rf__max_features': 172, 'rf__min_samples_leaf': 2}. Best is trial#179 with value: 0.5

[I 2020-05-16 14:20:51,884] Finished trial#224 with value: 0.5659420457528019 with parameters: {'loo__sigma': 0.04532016223787684, 'rf__max_depth': 5, 'rf__max_features': 157, 'rf__min_samples_leaf': 2}. Best is trial#215 with value: 0.5671316924094325.
[I 2020-05-16 14:20:53,113] Finished trial#225 with value: 0.5660029327374205 with parameters: {'loo__sigma': 0.04506717797694481, 'rf__max_depth': 5, 'rf__max_features': 158, 'rf__min_samples_leaf': 2}. Best is trial#215 with value: 0.5671316924094325.
[I 2020-05-16 14:20:53,804] Finished trial#226 with value: 0.565605540723132 with parameters: {'loo__sigma': 0.04222731530950962, 'rf__max_depth': 5, 'rf__max_features': 145, 'rf__min_samples_leaf': 2}. Best is trial#215 with value: 0.5671316924094325.
[I 2020-05-16 14:20:58,618] Finished trial#229 with value: 0.5135156437494086 with parameters: {'loo__sigma': 2.4406580579246198e-05, 'rf__max_depth': 5, 'rf__max_features': 158, 'rf__min_samples_leaf': 2}. Best is trial#215 with value: 0.

Best parameter (CV score=0.567):
{'loo__sigma': 0.06143626769580882, 'rf__max_depth': 5, 'rf__max_features': 155, 'rf__min_samples_leaf': 2}
Test R^2:  0.6112180976053222
Wall time: 20min 31s


In [32]:
preds_kaggle = search.best_estimator_.predict(pd.DataFrame(X_kaggle, columns=data1.columns[2:]))
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('te517_submission.csv', index=False)

In [33]:
%%time
#%autoreload 2
from category_encoders.pte_utils import EncoderWrapperR
from sklearn.model_selection import cross_val_score

pte = PosteriorImputationEncoder(cols=mean_enc_column_names, random_state=2834)
model = RandomForestRegressor(n_estimators=400, random_state=2834, n_jobs=-1) 
wrapper_model = EncoderWrapperR(pte, model)

param_distribution = {
    'encoder__prior_samples_ratio': LogUniformDistribution(1E-9, 1E-1),
    'encoder__n_draws': IntUniformDistribution(1,40),
    'encoder__include_precision': CategoricalDistribution([False, True]),
    'regressor__max_depth': IntUniformDistribution(2,40),
    'regressor__max_features' : IntUniformDistribution(1,X_test.shape[1]),
    'regressor__min_samples_leaf': IntUniformDistribution(1,15)
}


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)
X_train = pd.DataFrame(X_train, columns=data1.columns[2:])
X_test = pd.DataFrame(X_test, columns=data1.columns[2:])

search = optuna.integration.OptunaSearchCV(wrapper_model, param_distribution, 
                cv=5, n_jobs=-1, random_state=514, n_trials=None, timeout=2*60*60, scoring='r2')

search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
test_predict = search.best_estimator_.predict(X_test)
print('Test R^2: ', r2_score(y_test, test_predict))

[I 2020-05-16 14:25:46,519] Finished trial#0 with value: 0.5367037621434184 with parameters: {'encoder__prior_samples_ratio': 0.00019547001926932382, 'encoder__n_draws': 5, 'encoder__include_precision': False, 'regressor__max_depth': 23, 'regressor__max_features': 10, 'regressor__min_samples_leaf': 3}. Best is trial#0 with value: 0.5367037621434184.
[I 2020-05-16 14:31:48,856] Finished trial#5 with value: 0.5523780482239933 with parameters: {'encoder__prior_samples_ratio': 0.0005529974784712669, 'encoder__n_draws': 5, 'encoder__include_precision': True, 'regressor__max_depth': 13, 'regressor__max_features': 131, 'regressor__min_samples_leaf': 12}. Best is trial#5 with value: 0.5523780482239933.
[I 2020-05-16 14:36:50,789] Finished trial#6 with value: 0.5330232135217065 with parameters: {'encoder__prior_samples_ratio': 0.0001707926862003511, 'encoder__n_draws': 12, 'encoder__include_precision': True, 'regressor__max_depth': 28, 'regressor__max_features': 31, 'regressor__min_samples_leaf

[I 2020-05-16 15:48:55,030] Finished trial#29 with value: 0.5538007649961341 with parameters: {'encoder__prior_samples_ratio': 1.639286125129717e-05, 'encoder__n_draws': 27, 'encoder__include_precision': False, 'regressor__max_depth': 2, 'regressor__max_features': 183, 'regressor__min_samples_leaf': 3}. Best is trial#7 with value: 0.5678006949559029.
[I 2020-05-16 15:50:09,391] Finished trial#26 with value: 0.5669839914586193 with parameters: {'encoder__prior_samples_ratio': 2.171401504317188e-05, 'encoder__n_draws': 30, 'encoder__include_precision': True, 'regressor__max_depth': 4, 'regressor__max_features': 75, 'regressor__min_samples_leaf': 4}. Best is trial#7 with value: 0.5678006949559029.
[I 2020-05-16 15:53:38,854] Finished trial#28 with value: 0.5665717739101301 with parameters: {'encoder__prior_samples_ratio': 0.005398249917547135, 'encoder__n_draws': 27, 'encoder__include_precision': True, 'regressor__max_depth': 6, 'regressor__max_features': 66, 'regressor__min_samples_leaf'

Best parameter (CV score=0.568):
{'encoder__prior_samples_ratio': 1.6252754365317875e-05, 'encoder__n_draws': 33, 'encoder__include_precision': True, 'regressor__max_depth': 5, 'regressor__max_features': 71, 'regressor__min_samples_leaf': 4}
Test R^2:  0.6105991574122818
Wall time: 3h 4min 51s


In [34]:
preds_kaggle = search.best_estimator_.predict(pd.DataFrame(X_kaggle, columns=data1.columns[2:]))
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('sa517_submission.csv', index=False)