In [13]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output
import itertools
from tqdm import tqdm 
from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb

class CFG:
    """Notebook-wide constants shared by the loading and training cells."""

    # Reproducibility / cross-validation settings.
    seed = 42
    n_folds = 5

    # Name of the label column.
    target = 'target'

    # Directory prefixes (string-concatenated with file names, hence the trailing slash).
    input_dir = 'Data/'
    path = 'test/'

def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``; its string
        form is written to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def save_model(fold, scores=None):
    """Build a LightGBM callback that tracks the best validation score of a fold.

    Despite its name, the callback does not persist a model: it only keeps the
    highest value seen so far for the first evaluation result.

    Parameters
    ----------
    fold : hashable
        Key under which the best score is stored.
    scores : dict, optional
        Mapping updated in place. When omitted, the module-level ``score_dic``
        is looked up every time the callback fires (the original behaviour).

    Returns
    -------
    callable
        Callback accepting LightGBM's ``env`` object; carries ``order = 0``.
    """
    def callback(env):
        # Resolve lazily so that cells which rebind the global `score_dic`
        # between runs are honoured.
        best_scores = score_dic if scores is None else scores
        # Index 2 of the first evaluation entry is read as the metric value
        # (presumably LightGBM's (dataset, metric, value, higher_is_better) tuple — confirm).
        score = env.evaluation_result_list[0][2]
        # A fold with no seeded entry simply records its first score.
        if score > best_scores.get(fold, float('-inf')):
            best_scores[fold] = score

    callback.order = 0
    return callback

In [2]:
# Load the training feature table and its labels from the configured input directory.
train = pd.read_parquet(CFG.input_dir + 'train_all_slopes_corr_pcaslope.parquet')
# NOTE: unpickling can execute arbitrary code; only load label files from a trusted source.
# `.loc[train.index]` reorders the labels to follow the feature table's index.
labels = pd.read_pickle(CFG.input_dir + 'train_labels.pkl').loc[train.index]
train[CFG.target] = labels

# Categorical base columns; the "_last" suffix is appended to each name.
cat_features = [
    f"{name}_last"
    for name in (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )
]
train.shape, labels.shape

((458913, 2508), (458913, 1))

In [3]:
# Candidate correlation features, in a fixed order: the grid-search cell slices
# this list positionally (top_corr[1:15]), so the ordering must not change.
top_corr = [
    'corr_D_39-B_26',
    'corr_D_48-B_4',
    'corr_P_2-D_44',
    'corr_D_47-B_4',
    'corr_D_47-D_39',
    'corr_P_2-B_4',
    'corr_D_39-B_10',
    'corr_D_44-B_4',
    'corr_D_39-B_2',
    'corr_D_46-B_4',
    'corr_D_48-D_47',
    'corr_D_48-B_3',
    'corr_D_48-B_9',
    'corr_S_5-S_24',
    'corr_S_7-S_3',
    'corr_D_43-D_144',
    'corr_D_48-D_39',
    'corr_D_44-B_3',
    'corr_P_3-D_46',
    'corr_S_5-D_43',
    'corr_R_1-B_4',
    'corr_P_3-D_47',
    'corr_D_39-B_3',
    'corr_R_6-D_39',
    'corr_S_27-B_2',
    'corr_S_23-D_43',
    'corr_R_6-D_69',
    'corr_P_2-D_48',
    'corr_S_25-B_4',
    'corr_D_43-B_4',
    'corr_R_27-D_69',
    'corr_S_7-S_27',
    'corr_D_39-B_11',
    'corr_S_3-D_39',
    'corr_D_39-B_1',
    'corr_S_12-B_4',
    'corr_D_39-B_15',
    'corr_R_27-B_26',
    'corr_S_23-D_39',
    'corr_R_27-R_1',
    'corr_R_1-D_39',
    'corr_S_19-D_39',
    'corr_S_27-B_3',
    'corr_S_16-D_39',
    'corr_R_27-B_5',
    'corr_S_3-D_62',
    'corr_D_71-D_62',
    'corr_R_27-D_39',
]

In [18]:
def train_and_evaluate(train_data, parameters, folds=(1,), num_boost_round=300):
    """Train LightGBM on selected stratified folds of ``train_data``.

    The frame is split with ``StratifiedKFold`` (``CFG.n_folds`` splits,
    shuffled with ``CFG.seed``) on the ``CFG.target`` column. Only the folds
    listed in ``folds`` are trained. Each training run attaches the
    ``save_model(fold)`` callback and evaluates with ``lgb_amex_metric``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Feature columns plus the ``CFG.target`` column.
    parameters : dict
        Passed to ``lgb.train`` as ``params``.
    folds : iterable of int, optional
        Fold indices to train; defaults to ``(1,)`` as before.
    num_boost_round : int, optional
        Boosting rounds per fold; defaults to ``300`` as before.

    Returns
    -------
    dict
        Maps each trained fold index to the object returned by ``lgb.train``.
    """
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    # Exclude the same column that is used for stratification (was the literal 'target').
    features = [col for col in train_data.columns if col != CFG.target]
    models = {}
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_data, train_data[CFG.target])):
        if fold not in folds:
            continue

        x_train, x_val = train_data[features].iloc[trn_ind], train_data[features].iloc[val_ind]
        y_train, y_val = train_data[CFG.target].iloc[trn_ind], train_data[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        # Drop the pandas slices once they are wrapped in Datasets.
        del x_train, x_val, y_train, y_val; gc.collect()

        models[fold] = lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = num_boost_round,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold)],
            )
    return models

In [5]:
# Search space handed to ParameterGrid: every value is a list of candidates.
params = dict(
    objective=['binary'],
    metric=['amex_metric'],
    seed=[42],
    feature_fraction=[1],
    verbosity=[-1],
)

In [15]:
# Exhaustive search: score every 5-feature combination drawn from top_corr[1:15],
# always adding 'corr_D_39-B_26', for each parameter set in the grid.
corr_list = []
score_list = []
grid  = list(ParameterGrid(params))
len_grid = len(grid)
# Loop-invariant: the same candidate subsets are evaluated for every parameter set.
candidate_subsets = list(itertools.combinations(top_corr[1:15], 5))
for run, parameters in enumerate(grid):
    for subset in tqdm(candidate_subsets, desc=f'params {run + 1}/{len_grid}'):

        # Reset the per-fold best scores; the callback built by save_model
        # reads and updates this global.
        score_dic = {fold: 0.2 for fold in range(CFG.n_folds)}

        subset = list(subset)
        subset.append('corr_D_39-B_26')
        # Select the target together with the features (train already holds it)
        # rather than assigning into a column slice, which is chained assignment.
        train_data = train[subset + [CFG.target]]
        train_and_evaluate(train_data, parameters)
        corr_list.append(str(subset))
        # train_and_evaluate trains fold 1 by default, so that is the score recorded.
        score_list.append(score_dic[1])
        clear_output(wait=True)

  0%|          | 8/2002 [01:42<7:07:38, 12.87s/it]


KeyboardInterrupt: 

In [16]:
# Rank the evaluated feature subsets by score, best first.
score_df = (
    pd.DataFrame({'features': corr_list, 'score': score_list})
    .sort_values('score', ascending=False)
)
# Print the five best feature lists in full (the table view truncates them).
print(score_df['features'].head(5).values)
score_df

["['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-B_4', 'corr_D_47-D_39', 'corr_D_48-B_3', 'corr_D_39-B_26']"
 "['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-B_4', 'corr_D_47-D_39', 'corr_D_46-B_4', 'corr_D_39-B_26']"
 "['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-B_4', 'corr_D_47-D_39', 'corr_D_39-B_2', 'corr_D_39-B_26']"
 "['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-B_4', 'corr_D_47-D_39', 'corr_D_44-B_4', 'corr_D_39-B_26']"
 "['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-B_4', 'corr_D_47-D_39', 'corr_P_2-B_4', 'corr_D_39-B_26']"]


Unnamed: 0,features,score
6,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.530691
4,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.529039
3,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.527045
2,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.526846
0,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.526775
7,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.525788
5,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.523087
1,"['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_D_47-...",0.519557


In [19]:
# Reset the per-fold best scores; the callback built by save_model writes into this global.
score_dic = {fold: 0.2 for fold in range(CFG.n_folds)}

# Select the target together with the features (train already holds it) rather
# than assigning into a column slice, which is chained assignment.
# NOTE(review): `parameters` is whatever the grid-search loop last bound — confirm this is intended.
train_data = train[['corr_D_47-B_4', 'corr_P_2-B_4', 'corr_D_44-B_4', 'corr_D_48-B_3', 'corr_D_39-B_26', CFG.target]]
train_and_evaluate(train_data, parameters)
score_dic[1]

0.5315035344311915

In [20]:
# Reset the per-fold best scores; the callback built by save_model writes into this global.
score_dic = {fold: 0.2 for fold in range(CFG.n_folds)}

# Select the target together with the features (train already holds it) rather
# than assigning into a column slice, which is chained assignment.
# NOTE(review): `parameters` is whatever the grid-search loop last bound — confirm this is intended.
train_data = train[['corr_D_48-B_4', 'corr_P_2-D_44', 'corr_S_5-S_24', 'corr_D_48-B_9', 'corr_D_39-B_26', CFG.target]]
train_and_evaluate(train_data, parameters)
score_dic[1]

0.49963919803957824