In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import lightgbm as lgb

In [2]:
# Root folder for every input and output file. Paths are built by plain string
# concatenation, so the trailing '/' is required.
DATA_PATH = "E:/Kaggle/Avito/"

# Random state handed to every KFold splitter in this notebook.
seed = 32

In [3]:
# Level-1 model outputs, read from DATA_PATH + 'Stacking/'.
# The two lists are parallel: entry i of each list feeds column i of the
# stacked train / test matrices.
train_files = [
    'Meta_LGB1.csv',
    'Meta_LGBSVD1.csv',
    'Meta_LGBRidge.csv',
]
test_files = [
    '0.2155_Predictions_LGB_v2.csv',
    '0.2156_Predictions_LGBSVD_v2.csv',
    '0.2163_Predictions_LGBRidge.csv',
]

In [4]:
# Read train.csv and test.csv from DATA_PATH; `y` is the 'deal_probability'
# column of the training table and serves as the stacking target.
train, test = (
    pd.read_csv(DATA_PATH + csv_name) for csv_name in ('train.csv', 'test.csv')
)
y = train.loc[:, 'deal_probability']

In [5]:
# One single-column 2-D array per level-1 model: the second CSV column,
# restricted to the first len(train) rows of each file.
frames = [
    pd.read_csv(DATA_PATH + 'Stacking/' + meta_file).iloc[:len(train), 1:2].values
    for meta_file in train_files
]

In [6]:
# Place the per-model columns side by side: one row per sample, one column per
# entry of train_files.
X_meta = np.hstack(frames)

In [7]:
# Test-set predictions of each level-1 model: the second CSV column of every
# file in test_files, kept as a single-column 2-D array.
# Rows are capped at len(test), not len(train): these are test predictions and
# X_test must stay row-aligned with `test`, which later receives the blended
# predictions as a new column.
frames = []
for f in test_files:
    frames.append(pd.read_csv(DATA_PATH+'Stacking/'+f)[:len(test)].iloc[:,1:2].values)

In [8]:
# Place the per-model test predictions side by side: one column per entry of
# test_files.
X_test = np.hstack(frames)

In [9]:
# Pairwise correlation between the columns of X_meta (one column per file in
# train_files), to see how similar the level-1 models' predictions are.
pd.DataFrame(X_meta).corr()

Unnamed: 0,0,1,2
0,1.0,0.984227,0.966973
1,0.984227,1.0,0.965924
2,0.966973,0.965924,1.0


In [10]:
# Level-1 stacker #1: linear regression over the base-model predictions,
# evaluated with shuffled 10-fold cross-validation.
lin_meta = LinearRegression()
rmse, predict_test_kfolds = [], []

# Inputs for the next stacking level. Column 0 is filled by this linear
# stacker; column 1 is filled by the LightGBM stacker in a later cell.
X_meta2 = np.zeros((len(X_meta), 2))
X_test2 = np.zeros((len(X_test), 2))

lin_folds = KFold(10, random_state=seed, shuffle=True)
for tr_idx, val_idx in lin_folds.split(X_meta):
    lin_meta.fit(X_meta[tr_idx], y[tr_idx])

    # Out-of-fold predictions, clipped to [0, 1], scored and stored.
    oof_pred = lin_meta.predict(X_meta[val_idx]).clip(0.0, 1.0)
    rmse.append(mean_squared_error(y[val_idx], oof_pred) ** 0.5)
    X_meta2[val_idx, 0] = oof_pred

    # This fold's model also predicts the full test matrix; the per-fold
    # predictions are averaged in a later cell.
    predict_test_kfolds.append(lin_meta.predict(X_test).clip(0.0, 1.0))

avg_rmse = sum(rmse) / len(rmse)
print(avg_rmse)

0.21483301634798035


In [11]:
# Average the per-fold test predictions of the linear stacker, write them out
# as a CSV whose name is prefixed with the CV RMSE, and keep them as column 0
# of the level-2 test matrix.
kfold_predictions = np.mean(predict_test_kfolds, axis=0)
test['deal_probability'] = kfold_predictions

submission_path = DATA_PATH+f'{avg_rmse:.4f}_'+'stack_avg_lm.csv'
test[['item_id','deal_probability']].to_csv(submission_path, index=False)

X_test2[:, 0] = kfold_predictions

In [12]:
# lin_meta is refit inside every fold of the cross-validation loop, so these
# are the coefficients of the final fold's fit only, not an average over folds.
# One coefficient per column of X_meta.
lin_meta.coef_

array([0.36895613, 0.3650739 , 0.30453523])

In [13]:
# Level-1 stacker #2: LightGBM regressor over the base-model predictions,
# evaluated with shuffled 10-fold cross-validation.
rmse, predict_test_kfolds = [], []

lgb_params = {
    'learning_rate': 0.01,
    'task': 'train',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'num_leaves': 30,
    'max_depth': 5,
    'min_data_in_leaf': 20,  # default is 20
    'feature_fraction': 1,
    'feature_fraction_seed': 0,
    'bagging_fraction': 1,
    # 'bagging_freq': 2,
    'bagging_seed': 0,
    'verbose': 1,
    'num_threads': 4,  # put to 4 if you are leaving the computer
}

lgb_folds = KFold(10, random_state=seed, shuffle=True)
for tr_idx, val_idx in lgb_folds.split(X_meta):
    lgtrain = lgb.Dataset(X_meta[tr_idx], y[tr_idx])
    lgvalid = lgb.Dataset(X_meta[val_idx], y[val_idx])

    # Train with early stopping monitored on the held-out fold.
    lgb_meta = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train', 'valid'],
        early_stopping_rounds=10,
        verbose_eval=200,
    )

    # Out-of-fold predictions, clipped to [0, 1], scored and stored in
    # column 1 of the level-2 train matrix.
    oof_pred = lgb_meta.predict(X_meta[val_idx]).clip(0.0, 1.0)
    rmse.append(mean_squared_error(y[val_idx], oof_pred) ** 0.5)
    X_meta2[val_idx, 1] = oof_pred

    # This fold's booster also predicts the full test matrix.
    predict_test_kfolds.append(lgb_meta.predict(X_test).clip(0.0, 1.0))

avg_rmse = sum(rmse) / len(rmse)
print(avg_rmse)

Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215618	valid's rmse: 0.215622
[400]	train's rmse: 0.214655	valid's rmse: 0.214741
Early stopping, best iteration is:
[497]	train's rmse: 0.21462	valid's rmse: 0.214732
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215549	valid's rmse: 0.216277
[400]	train's rmse: 0.214587	valid's rmse: 0.215361
Early stopping, best iteration is:
[541]	train's rmse: 0.214539	valid's rmse: 0.215345
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215607	valid's rmse: 0.215733
[400]	train's rmse: 0.214643	valid's rmse: 0.214835
Early stopping, best iteration is:
[491]	train's rmse: 0.214609	valid's rmse: 0.214825
Training until validation scores don't improve for 10 rounds.
[200]	train's rmse: 0.215582	valid's rmse: 0.216083
[400]	train's rmse: 0.214618	valid's rmse: 0.215063
[600]	train's rmse: 0.214562	valid's rmse: 0.215035
Early stopping, best ite

In [14]:
# lgb_meta is reassigned on every fold of the cross-validation loop, so this
# shows the feature importances of the last fold's booster only.
# One value per column of X_meta.
lgb_meta.feature_importance()

array([3854, 3767, 4587])

In [15]:
# Average the per-fold test predictions of the LightGBM stacker, write them out
# as a CSV whose name is prefixed with the CV RMSE, and keep them as column 1
# of the level-2 test matrix.
kfold_predictions = np.mean(predict_test_kfolds, axis=0)
test['deal_probability'] = kfold_predictions

submission_path = DATA_PATH+f'{avg_rmse:.5f}_'+'Stack_LGB_10Fold.csv'
test[['item_id','deal_probability']].to_csv(submission_path, index=False)

X_test2[:, 1] = kfold_predictions

## Meta level 2

In [16]:
# Level-2 blend: linear regression over the two level-1 stacker outputs held in
# X_meta2 / X_test2, evaluated with shuffled 5-fold cross-validation.
lin_meta2 = LinearRegression()
rmse, predict_test_kfolds = [], []

level2_folds = KFold(5, random_state=seed, shuffle=True)
for tr_idx, val_idx in level2_folds.split(X_meta):
    lin_meta2.fit(X_meta2[tr_idx], y[tr_idx])

    # Score the held-out fold on predictions clipped to [0, 1].
    val_pred = lin_meta2.predict(X_meta2[val_idx]).clip(0.0, 1.0)
    rmse.append(mean_squared_error(y[val_idx], val_pred) ** 0.5)

    # This fold's model also predicts the level-2 test matrix.
    predict_test_kfolds.append(lin_meta2.predict(X_test2).clip(0.0, 1.0))

avg_rmse = sum(rmse) / len(rmse)
print(avg_rmse)

0.21475979383930838


In [17]:
# Average the per-fold test predictions of the level-2 blend and write them out
# as a CSV whose name is prefixed with the CV RMSE.
kfold_predictions = np.mean(predict_test_kfolds, axis=0)
test['deal_probability'] = kfold_predictions

submission_path = DATA_PATH+f'{avg_rmse:.5f}_'+'stack_avg_lm_level2.csv'
test[['item_id','deal_probability']].to_csv(submission_path, index=False)

In [18]:
# lin_meta2 is refit inside every fold, so these are the coefficients of the
# final fold's fit only. Order follows the columns of X_meta2:
# [linear stacker, LightGBM stacker].
lin_meta2.coef_

array([0.36319288, 0.64295954])