In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import lightgbm as lgb
import hyperopt

In [2]:
# Root folder holding train.csv, test.csv and the Stacking/ sub-folder.
# Defaults to the original local location; set the AVITO_DATA_PATH environment
# variable to run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get('AVITO_DATA_PATH', 'E:/Kaggle/Avito/')
# Every path below is built by plain string concatenation, so make sure the
# root always ends with a separator.
if not DATA_PATH.endswith(('/', '\\')):
    DATA_PATH += '/'

# random_state passed to every KFold split in this notebook.
seed = 32

In [3]:
# Training-side prediction files of the base models (read from Stacking/).
train_files = [
    'Meta_LGB1.csv',
    'Meta_LGBSVD1.csv',
    'Meta_LGBRidge.csv',
    'Meta_LGB106205.csv',
    'Meta_LGB106204.csv',
    'Meta_LGB106203.csv',
    'Meta_LGB106202.csv',
    'Meta_LGB10620.csv',
    'Meta_LGBImages_v3.csv',
]

# Test-side prediction files of the base models.
# NOTE(review): presumably listed in the same model order as train_files, since
# both lists are turned into feature columns by position - confirm.
test_files = [
    '0.2155_Predictions_LGB_v2.csv',
    '0.2156_Predictions_LGBSVD_v2.csv',
    '0.2163_Predictions_LGBRidge.csv',
    '0.21514_Predictions_LGB_v206205.csv',
    '0.2152_Predictions_LGB_v206204.csv',
    '0.2151_Predictions_LGB_v206203.csv',
    '0.2152_Predictions_LGB_v206202.csv',
    '0.2153_Predictions_LGB_v20620.csv',
    '0.21512_Predictions_LGBImages_v3.csv',
]

In [4]:
# Read the two competition tables (train first, then test) and keep the
# regression target of the training table in its own variable.
train, test = (pd.read_csv(DATA_PATH+table) for table in ('train.csv', 'test.csv'))
y = train['deal_probability']

In [5]:
# One (n, 1) array per training-side file: at most len(train) rows are kept and
# only the second CSV column (position 1) is used.
frames = [
    pd.read_csv(DATA_PATH+'Stacking/'+f)[:len(train)].iloc[:,1:2].values
    for f in train_files
]

In [6]:
# Join the per-file column arrays side by side: one column per entry of train_files.
X_meta = np.concatenate(frames,axis=1)

In [7]:
# One (n, 1) array per test-side prediction file, using the second CSV column.
# The row cap is len(test): these files hold predictions for the test table, and
# the resulting matrix is later assigned row-for-row to columns of `test`, so
# capping by len(train) would not protect against over-long files.
frames = []
for f in test_files:
    frames.append(pd.read_csv(DATA_PATH+'Stacking/'+f)[:len(test)].iloc[:,1:2].values)

In [8]:
# Join the per-file column arrays side by side: one column per entry of test_files.
X_test = np.concatenate(frames,axis=1)

In [9]:
# Pairwise correlation between the columns of X_meta (one column per base-model file).
pd.DataFrame(X_meta).corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.984227,0.966973,0.984309,0.986109,0.986171,0.986365,0.987286,0.986377
1,0.984227,1.0,0.965924,0.982783,0.984204,0.983849,0.984478,0.985146,0.982572
2,0.966973,0.965924,1.0,0.966835,0.967076,0.966802,0.96733,0.967356,0.966495
3,0.984309,0.982783,0.966835,1.0,0.988887,0.988818,0.988332,0.987636,0.985232
4,0.986109,0.984204,0.967076,0.988887,1.0,0.991313,0.988786,0.988367,0.98589
5,0.986171,0.983849,0.966802,0.988818,0.991313,1.0,0.988896,0.988597,0.986044
6,0.986365,0.984478,0.96733,0.988332,0.988786,0.988896,1.0,0.988976,0.986219
7,0.987286,0.985146,0.967356,0.987636,0.988367,0.988597,0.988976,1.0,0.986476
8,0.986377,0.982572,0.966495,0.985232,0.98589,0.986044,0.986219,0.986476,1.0


In [10]:
# Level-2 feature matrices with five columns, one per level-1 stacker.
# Training rows receive out-of-fold predictions inside the CV loops.
X_meta2 = np.zeros((len(X_meta),5))
X_test2 = np.zeros((len(X_test),5))

# Stacker 0: linear regression on the base-model predictions, 10-fold CV.
lin_meta = LinearRegression()
rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=10,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta):
    lin_meta.fit(X_meta[fit_rows],y[fit_rows])

    # Out-of-fold predictions, clipped to [0, 1], become level-2 column 0.
    holdout_pred = lin_meta.predict(X_meta[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)
    X_meta2[holdout_rows,0]=holdout_pred

    # The model of every fold also predicts the whole test matrix.
    predict_test_kfolds.append(lin_meta.predict(X_test).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

0.21448328095727134


In [11]:
# Average the per-fold test predictions of the linear stacker, keep them as
# level-2 test column 0 and write them out with the CV score in the file name.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
X_test2[:,0] = kfold_predictions
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.4f}_'+'stack_avg_lm.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)

In [12]:
# Coefficients left by the most recent lin_meta.fit() call (the estimator is refit on every fold).
lin_meta.coef_

array([-0.03550026,  0.07920942,  0.20101695,  0.18661057,  0.10741262,
        0.12893479,  0.06830707,  0.03643806,  0.27265837])

In [13]:
# Stacker 1: LightGBM regressor on the base-model predictions, 10-fold CV.
lgb_params = dict(
    learning_rate=0.01,
    task='train',
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    num_leaves=30,
    max_depth=5,
    min_data_in_leaf=20,  # Default 20
    feature_fraction=1,
    feature_fraction_seed=0,
    bagging_fraction=1,
    # bagging_freq=2 is intentionally left disabled
    bagging_seed=0,
    verbose=1,
    num_threads=4,  # Put to 4 if you are leaving computer
)

rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=10,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta):
    lgtrain = lgb.Dataset(X_meta[fit_rows],label=y[fit_rows])
    lgvalid = lgb.Dataset(X_meta[holdout_rows],label=y[holdout_rows])

    # Up to 5000 rounds with a 10-round early-stopping patience; both the
    # training part and the held-out fold are reported every 200 rounds.
    lgb_meta = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=10,
        verbose_eval=200,
    )

    # Out-of-fold predictions, clipped to [0, 1], become level-2 column 1.
    holdout_pred = lgb_meta.predict(X_meta[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)
    X_meta2[holdout_rows,1]=holdout_pred

    # The model of every fold also predicts the whole test matrix.
    predict_test_kfolds.append(lgb_meta.predict(X_test).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215222	valid's rmse: 0.215112
[400]	train's rmse: 0.214205	valid's rmse: 0.214237
Early stopping, best iteration is:
[494]	train's rmse: 0.214153	valid's rmse: 0.214225
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215151	valid's rmse: 0.215774
[400]	train's rmse: 0.214132	valid's rmse: 0.21487
Early stopping, best iteration is:
[560]	train's rmse: 0.214052	valid's rmse: 0.214853
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215196	valid's rmse: 0.215439
[400]	train's rmse: 0.214178	valid's rmse: 0.214512
Early stopping, best iteration is:
[503]	train's rmse: 0.214119	valid's rmse: 0.214496
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215173	valid's rmse: 0.215736
[400]	train's rmse: 0.214157	valid's rmse: 0.214684
[600]	train's rmse: 0.214063	valid's rmse: 0.214645
Early stopping, best ite

In [14]:
# Per-feature importance of lgb_meta, i.e. the booster trained most recently (it is reassigned on every fold).
lgb_meta.feature_importance()

array([ 826, 1320, 2853, 1474, 1066, 1116,  967, 1072, 1928])

In [15]:
# Average the per-fold test predictions of LightGBM stacker 1, keep them as
# level-2 test column 1 and write them out with the CV score in the file name.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
X_test2[:,1] = kfold_predictions
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.5f}_'+'Stack_LGB_10Fold.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)

In [16]:
import keras
def rms(y, y_pred):
    """Keras metric: root of the mean squared difference between y_pred and y, times 10.

    Built from keras.backend ops so it can be passed in `metrics=[...]`.
    The value is 10x the plain RMSE.
    """
    backend = keras.backend
    mean_squared = backend.mean(backend.square(y_pred - y))
    return backend.sqrt(mean_squared)*10

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [17]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras_tqdm import TQDMNotebookCallback

def make_nn():
    """Build the uncompiled feed-forward stacker network.

    The input width is taken from X_meta.shape[1]. Layers, in order:
    Dense(16, tanh) -> Dropout(0.2) -> Dense(8, tanh) -> Dense(1, sigmoid).

    Returns:
        A keras Model mapping the input tensor to the single sigmoid output.
    """
    features_in = Input((X_meta.shape[1],))
    hidden = Dense(16,activation='tanh')(features_in)
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(8,activation='tanh')(hidden)
    output = Dense(1,activation='sigmoid')(hidden)
    return Model(inputs=features_in,outputs=output)

In [18]:
# Stacker 2: small neural network on the base-model predictions, 10-fold CV.
epochs = 2
rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=10,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta):
    # Reset the Keras session and build a new network for every fold.
    keras.backend.clear_session()
    nn_model = make_nn()
    nn_model.compile('Adam',loss='mean_squared_error',metrics=[rms])
    nn_model.fit(
        X_meta[fit_rows],
        y[fit_rows],
        epochs=epochs,
        validation_data=(X_meta[holdout_rows],y[holdout_rows]),
        batch_size=512,
        verbose=0,
        callbacks=[TQDMNotebookCallback(leave_inner=True)],
    )

    # Out-of-fold predictions, clipped to [0, 1]; flattened before being
    # written into level-2 column 2.
    holdout_pred = nn_model.predict(X_meta[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)
    X_meta2[holdout_rows,2]=holdout_pred.flatten()

    # The model of every fold also predicts the whole test matrix.
    predict_test_kfolds.append(nn_model.predict(X_test).clip(0.0,1.0))

HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353081), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353081), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353081), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353081), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353082), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353082), HTML(value='')))




In [19]:
# Mean hold-out RMSE of the neural-network stacker over the CV folds.
avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

# The per-fold network predictions are stored as returned by predict(), which
# presumably yields one column per output unit (the original code flattened
# them only for X_test2). Flatten the fold average once here so that a plain
# 1-D vector is assigned to the DataFrame column, written to the CSV and
# stored as level-2 test column 2. ravel() is a no-op for already 1-D input.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0).ravel()
test['deal_probability'] = kfold_predictions
test[['item_id','deal_probability']].to_csv(DATA_PATH+f'{avg_rmse:.4f}_'+'stack_avg_nn.csv',index=False)
X_test2[:,2] = kfold_predictions

0.21467797982009534


In [20]:
# Stacker 3: second LightGBM regressor (deeper trees, more leaves, higher
# learning rate than stacker 1) on the base-model predictions, 10-fold CV.
lgb_params = dict(
    learning_rate=0.015,
    task='train',
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    num_leaves=100,
    max_depth=7,
    min_data_in_leaf=20,  # Default 20
    feature_fraction=1,
    feature_fraction_seed=1,
    bagging_fraction=1,
    # bagging_freq=2 is intentionally left disabled
    bagging_seed=0,
    verbose=1,
    num_threads=4,  # Put to 4 if you are leaving computer
)

rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=10,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta):
    lgtrain = lgb.Dataset(X_meta[fit_rows],label=y[fit_rows])
    lgvalid = lgb.Dataset(X_meta[holdout_rows],label=y[holdout_rows])

    # Up to 5000 rounds with a 10-round early-stopping patience.
    lgb_meta = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=10,
        verbose_eval=200,
    )

    # Out-of-fold predictions, clipped to [0, 1], become level-2 column 3.
    holdout_pred = lgb_meta.predict(X_meta[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)
    X_meta2[holdout_rows,3]=holdout_pred

    # The model of every fold also predicts the whole test matrix.
    predict_test_kfolds.append(lgb_meta.predict(X_test).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

# Fold-averaged test predictions: level-2 test column 3 plus a submission file.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
X_test2[:,3] = kfold_predictions
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.4f}_'+'stack_avg_lgb2.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)

Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.214015	valid's rmse: 0.214336
Early stopping, best iteration is:
[336]	train's rmse: 0.213683	valid's rmse: 0.214237
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.213941	valid's rmse: 0.214985
Early stopping, best iteration is:
[296]	train's rmse: 0.213668	valid's rmse: 0.214878
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.213985	valid's rmse: 0.214632
Early stopping, best iteration is:
[282]	train's rmse: 0.21374	valid's rmse: 0.214532
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.213961	valid's rmse: 0.214844
Early stopping, best iteration is:
[367]	train's rmse: 0.213595	valid's rmse: 0.214681
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.214038	valid's rmse: 0.214162
Early stopping, best iteration is:
[329]	train's rmse: 0.213718	valid's rmse: 0.214

In [21]:
# Stacker 4: third LightGBM regressor (300 leaves, learning rate 0.005,
# min_data_in_leaf 15) on the base-model predictions, 10-fold CV.
lgb_params = dict(
    learning_rate=0.005,
    task='train',
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    num_leaves=300,
    max_depth=7,
    min_data_in_leaf=15,  # Default 20
    feature_fraction=1,
    feature_fraction_seed=1,
    bagging_fraction=1,
    # bagging_freq=2 is intentionally left disabled
    bagging_seed=0,
    verbose=1,
    num_threads=4,  # Put to 4 if you are leaving computer
)

rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=10,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta):
    lgtrain = lgb.Dataset(X_meta[fit_rows],label=y[fit_rows])
    lgvalid = lgb.Dataset(X_meta[holdout_rows],label=y[holdout_rows])

    # Up to 5000 rounds with a 10-round early-stopping patience.
    lgb_meta = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=10,
        verbose_eval=200,
    )

    # Out-of-fold predictions, clipped to [0, 1], become level-2 column 4.
    holdout_pred = lgb_meta.predict(X_meta[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)
    X_meta2[holdout_rows,4]=holdout_pred

    # The model of every fold also predicts the whole test matrix.
    predict_test_kfolds.append(lgb_meta.predict(X_test).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

# Fold-averaged test predictions: level-2 test column 4 plus a submission file.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
X_test2[:,4] = kfold_predictions
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.4f}_'+'stack_avg_lgb3.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)

## Meta level 2

In [23]:
# Level 2, model A: linear regression on the five stacker columns, 5-fold CV.
lin_meta2 = LinearRegression()
rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=5,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta2):
    lin_meta2.fit(X_meta2[fit_rows],y[fit_rows])

    # Score the held-out fold on predictions clipped to [0, 1].
    holdout_pred = lin_meta2.predict(X_meta2[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)

    # The model of every fold also predicts the level-2 test matrix.
    predict_test_kfolds.append(lin_meta2.predict(X_test2).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

0.21437074253284544


In [24]:
# Average the per-fold test predictions of the level-2 linear model and write
# them out with the CV score in the file name.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.5f}_'+'stack_avg_lm_level2.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)

In [25]:
# Coefficients left by the most recent lin_meta2.fit() call: one weight per column of X_meta2.
lin_meta2.coef_

array([0.25986902, 0.38675334, 0.11587541, 0.10167575, 0.14756954])

In [26]:
# Level 2, model B: LightGBM regressor on the five stacker columns, 5-fold CV.
lgb_params = dict(
    learning_rate=0.01,
    task='train',
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    num_leaves=30,
    max_depth=5,
    min_data_in_leaf=20,  # Default 20
    feature_fraction=1,
    feature_fraction_seed=0,
    bagging_fraction=1,
    # bagging_freq=2 is intentionally left disabled
    bagging_seed=0,
    verbose=1,
    num_threads=4,  # Put to 4 if you are leaving computer
)

rmse = []
predict_test_kfolds = []
folds = KFold(n_splits=5,shuffle=True,random_state=seed)
for fit_rows, holdout_rows in folds.split(X_meta2):
    lgtrain = lgb.Dataset(X_meta2[fit_rows],label=y[fit_rows])
    lgvalid = lgb.Dataset(X_meta2[holdout_rows],label=y[holdout_rows])

    # Up to 5000 rounds with a 10-round early-stopping patience.
    lgb_meta2 = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=10,
        verbose_eval=200,
    )

    # Score the held-out fold on predictions clipped to [0, 1].
    holdout_pred = lgb_meta2.predict(X_meta2[holdout_rows]).clip(0.0,1.0)
    rmse.append(mean_squared_error(y[holdout_rows],holdout_pred)**0.5)

    # The model of every fold also predicts the level-2 test matrix.
    predict_test_kfolds.append(lgb_meta2.predict(X_test2).clip(0.0,1.0))

avg_rmse = sum(rmse)/len(rmse)
print(avg_rmse)

Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.21521	valid's rmse: 0.215451
[400]	train's rmse: 0.214266	valid's rmse: 0.214575
Early stopping, best iteration is:
[536]	train's rmse: 0.214222	valid's rmse: 0.214561
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215194	valid's rmse: 0.215566
[400]	train's rmse: 0.214252	valid's rmse: 0.214619
Early stopping, best iteration is:
[570]	train's rmse: 0.214198	valid's rmse: 0.214601
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215257	valid's rmse: 0.215297
[400]	train's rmse: 0.214317	valid's rmse: 0.214357
Early stopping, best iteration is:
[538]	train's rmse: 0.214271	valid's rmse: 0.214338
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215278	valid's rmse: 0.215147
[400]	train's rmse: 0.214339	valid's rmse: 0.214277
Early stopping, best iteration is:
[511]	train's rmse: 0.214302	valid's rmse

In [27]:
# Average the per-fold test predictions of the level-2 LightGBM model and write
# them out with the CV score in the file name.
kfold_predictions = np.stack(predict_test_kfolds).mean(axis=0)
test['deal_probability'] = kfold_predictions
submission_path = DATA_PATH+f'{avg_rmse:.5f}_'+'stack_avg_lgb_level2.csv'
test[['item_id','deal_probability']].to_csv(submission_path,index=False)