In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features
np.warnings.filterwarnings('ignore')
import dask.dataframe as dd
import missingno as msno
from pandasql import sqldf
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
import matplotlib.gridspec as gridspec
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
#Always seed the randomness of this universe
np.random.seed(51)

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
%%time
train_metadata_kaggle = dd.read_csv('mydata_train_metadata.csv')
test_metadata_kaggle = dd.read_csv('mydata_test_metadata.csv')
train_metadata_kaggle = train_metadata_kaggle.compute()
test_metadata_kaggle = test_metadata_kaggle.compute()
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

(7848, 139) (3492890, 138)
CPU times: user 3min 20s, sys: 11.8 s, total: 3min 32s
Wall time: 38.4 s


In [5]:
%%time
test_metadata_kaggle = test_metadata_kaggle.reset_index(drop=True)

CPU times: user 2.5 s, sys: 3.39 s, total: 5.89 s
Wall time: 5.88 s


In [8]:
test_id = test_metadata_kaggle['object_id']

In [9]:
def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False

# MAIN TRAINING

In [14]:
%%time
final_dict = {}

loss_list = []
temp = train_metadata_kaggle.copy()
#temp = temp.merge(train_metadata[['object_id'] + used_columns1],on = 'object_id',how = 'left')
print(temp.shape)
#temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')
y = temp['target']
del temp['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
#'max_depth': 7, 
'n_estimators': 1000, 
'subsample_freq': 2, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 10,
'min_child_weight': 200.0, 
#'min_child_weight': 100.0, 
'min_split_gain': 0.1, 
'num_leaves': 7, 
#'reg_alpha': 0.1,
'reg_alpha': 0.0, 
'reg_lambda': 0.00023, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

(7848, 139)
0 0.5846232690590423
1 0.5598478406280183
2 0.6060594241195905
3 0.5551329424743721
4 0.5738283897556737
MULTI WEIGHTED LOG LOSS : 0.57588 
CPU times: user 11min 34s, sys: 1.73 s, total: 11min 36s
Wall time: 1min 34s


# PARAMETER TUNING

In [16]:
final_dict = {}
parameter_tuned = 'subsample_freq'
for i in [2,3,4,5,6]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 1000, 
    'subsample_freq': i, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 10,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,6))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

(7848, 139)
2 [0.5846232690590423, 0.5598478406280183, 0.6060594241195905, 0.5551329424743721, 0.5738283897556737]
MULTI WEIGHTED LOG LOSS : 0.57588 
(7848, 139)
3 [0.5812347980144251, 0.558722069534653, 0.6094043217820466, 0.5643351744809683, 0.57406946929533]
MULTI WEIGHTED LOG LOSS : 0.57754 
(7848, 139)
4 [0.5795408254189881, 0.5621103145857294, 0.6087282931399269, 0.5634614254711744, 0.567038047228325]
MULTI WEIGHTED LOG LOSS : 0.57616 
(7848, 139)
5 [0.5818531991668668, 0.5566890683011684, 0.6086201149905734, 0.5614331255465247, 0.5663315357166874]
MULTI WEIGHTED LOG LOSS : 0.57500 
(7848, 139)
6 [0.577211069460461, 0.5579433752331334, 0.6021804408687527, 0.5656735512550161, 0.5719369423290517]
MULTI WEIGHTED LOG LOSS : 0.57495 


In [19]:
print(pd.Series(final_dict))

2    [0.5846232690590423, 0.5598478406280183, 0.606...
3    [0.5812347980144251, 0.558722069534653, 0.6094...
4    [0.5795408254189881, 0.5621103145857294, 0.608...
5    [0.5818531991668668, 0.5566890683011684, 0.608...
6    [0.577211069460461, 0.5579433752331334, 0.6021...
dtype: object

In [20]:
%%time
final_dict = {}
parameter_tuned = 'subsample_freq'
for i in [2000,3000,4000,5000,6000]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 1000, 
    'subsample_freq': 6, 
    'subsample_for_bin': i, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 10,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

2000 [0.583214, 0.564629, 0.6046, 0.5667, 0.56794]
MULTI WEIGHTED LOG LOSS : 0.57747 
3000 [0.580759, 0.561566, 0.60691, 0.567888, 0.569603]
MULTI WEIGHTED LOG LOSS : 0.57735 
4000 [0.583355, 0.560267, 0.612454, 0.564482, 0.568709]
MULTI WEIGHTED LOG LOSS : 0.57788 
5000 [0.577211, 0.557943, 0.60218, 0.565674, 0.571937]
MULTI WEIGHTED LOG LOSS : 0.57495 
6000 [0.580719, 0.558598, 0.608918, 0.571142, 0.568037]
MULTI WEIGHTED LOG LOSS : 0.57748 
subsample_freq_2000        [0.583214, 0.564629, 0.6046, 0.5667, 0.56794]
subsample_freq_3000    [0.580759, 0.561566, 0.60691, 0.567888, 0.569603]
subsample_freq_4000    [0.583355, 0.560267, 0.612454, 0.564482, 0.568...
subsample_freq_5000    [0.577211, 0.557943, 0.60218, 0.565674, 0.571937]
subsample_freq_6000    [0.580719, 0.558598, 0.608918, 0.571142, 0.568...
dtype: object


In [21]:
%%time
final_dict = {}
parameter_tuned = 'metric_freq'
for i in [1,5,10,20,50]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 1000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 10,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

1 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
5 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
10 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
20 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
50 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
metric_freq_1     [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
metric_freq_5     [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
metric_freq_10    [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
metric_freq_20    [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
metric_freq_50    [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
dtype: object
CPU times: user 1h 1min 57s, sys: 12.4 s, total: 1h 2min 9s
Wall time: 8min 36s


In [23]:
%%time
final_dict = {}
parameter_tuned = 'colsample_bytree'
for i in [0.4,0.5,0.6,0.7,0.8,0.9]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 1000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': i, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 10,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.4 [0.58174, 0.55887, 0.60791, 0.56649, 0.57043]
MULTI WEIGHTED LOG LOSS : 0.57711 
0.5 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
0.6 [0.57435, 0.5588, 0.61011, 0.56269, 0.57587]
MULTI WEIGHTED LOG LOSS : 0.57634 
0.7 [0.57602, 0.5622, 0.61196, 0.56568, 0.57914]
MULTI WEIGHTED LOG LOSS : 0.57896 
0.8 [0.5767, 0.56184, 0.61859, 0.56897, 0.58117]
MULTI WEIGHTED LOG LOSS : 0.58139 
0.9 [0.57998, 0.56492, 0.61784, 0.57321, 0.58108]
MULTI WEIGHTED LOG LOSS : 0.58336 
colsample_bytree_0.4    [0.58174, 0.55887, 0.60791, 0.56649, 0.57043]
colsample_bytree_0.5    [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
colsample_bytree_0.6     [0.57435, 0.5588, 0.61011, 0.56269, 0.57587]
colsample_bytree_0.7     [0.57602, 0.5622, 0.61196, 0.56568, 0.57914]
colsample_bytree_0.8     [0.5767, 0.56184, 0.61859, 0.56897, 0.58117]
colsample_bytree_0.9    [0.57998, 0.56492, 0.61784, 0.57321, 0.58108]
dtype: object
CPU times: user 1h 23min 14s, sys: 17.1 s, total: 1h 23min 

In [26]:
%%time
final_dict = {}
#0.0267
parameter_tuned = 'learning_rate'
for i in [0.010,0.015,0.020,0.025,0.030,0.035]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': i, 
    'max_drop': 5, 
    'min_child_samples': 10,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.01 [0.57821, 0.56288, 0.61278, 0.56149, 0.57077]
MULTI WEIGHTED LOG LOSS : 0.57722 
0.015 [0.57744, 0.56311, 0.60913, 0.56052, 0.56951]
MULTI WEIGHTED LOG LOSS : 0.57597 
0.02 [0.58188, 0.55953, 0.61009, 0.56367, 0.57297]
MULTI WEIGHTED LOG LOSS : 0.57764 
0.025 [0.57482, 0.55745, 0.60672, 0.57016, 0.57165]
MULTI WEIGHTED LOG LOSS : 0.57609 
0.03 [0.57289, 0.56307, 0.60392, 0.56798, 0.57133]
MULTI WEIGHTED LOG LOSS : 0.57579 
0.035 [0.57773, 0.56208, 0.6028, 0.56419, 0.57361]
MULTI WEIGHTED LOG LOSS : 0.57605 
learning_rate_0.01     [0.57821, 0.56288, 0.61278, 0.56149, 0.57077]
learning_rate_0.015    [0.57744, 0.56311, 0.60913, 0.56052, 0.56951]
learning_rate_0.02     [0.58188, 0.55953, 0.61009, 0.56367, 0.57297]
learning_rate_0.025    [0.57482, 0.55745, 0.60672, 0.57016, 0.57165]
learning_rate_0.03     [0.57289, 0.56307, 0.60392, 0.56798, 0.57133]
learning_rate_0.035     [0.57773, 0.56208, 0.6028, 0.56419, 0.57361]
dtype: object
CPU times: user 1h 38min 36s, sys: 21.5 s, total: 1h 3

In [27]:
%%time
final_dict = {}
#10
parameter_tuned = 'min_child_samples'
for i in [5,10,20,30,40]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    #'min_child_samples': 10,
    'min_child_samples': i,
    'min_child_weight': 200.0, 
    #'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

5 [0.57613, 0.55804, 0.60028, 0.56498, 0.57032]
MULTI WEIGHTED LOG LOSS : 0.57390 
10 [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
MULTI WEIGHTED LOG LOSS : 0.57495 
20 [0.57721, 0.56012, 0.59885, 0.5658, 0.56983]
MULTI WEIGHTED LOG LOSS : 0.57433 
30 [0.57787, 0.55954, 0.60001, 0.56433, 0.56959]
MULTI WEIGHTED LOG LOSS : 0.57423 
40 [0.5773, 0.55988, 0.5937, 0.56723, 0.57061]
MULTI WEIGHTED LOG LOSS : 0.57372 
min_child_samples_5     [0.57613, 0.55804, 0.60028, 0.56498, 0.57032]
min_child_samples_10    [0.57721, 0.55794, 0.60218, 0.56567, 0.57194]
min_child_samples_20     [0.57721, 0.56012, 0.59885, 0.5658, 0.56983]
min_child_samples_30    [0.57787, 0.55954, 0.60001, 0.56433, 0.56959]
min_child_samples_40      [0.5773, 0.55988, 0.5937, 0.56723, 0.57061]
dtype: object
CPU times: user 48min 3s, sys: 7 s, total: 48min 10s
Wall time: 6min 17s


In [28]:
%%time
final_dict = {}
#200
parameter_tuned = 'min_child_weight'
for i in [100,150,200,250,300]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    #'min_child_samples': 10,
    'min_child_samples': 40,
    'min_child_weight': i, 
    #'min_child_weight': 200.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    #'reg_alpha': 0.1,
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

100 [0.58603, 0.55264, 0.60075, 0.56095, 0.56961]
MULTI WEIGHTED LOG LOSS : 0.57397 
150 [0.57981, 0.55451, 0.59484, 0.56493, 0.56796]
MULTI WEIGHTED LOG LOSS : 0.57238 
200 [0.5773, 0.55988, 0.5937, 0.56723, 0.57061]
MULTI WEIGHTED LOG LOSS : 0.57372 
250 [0.58225, 0.56556, 0.601, 0.56785, 0.5732]
MULTI WEIGHTED LOG LOSS : 0.57797 
300 [0.58125, 0.57181, 0.60151, 0.57276, 0.57158]
MULTI WEIGHTED LOG LOSS : 0.57982 
min_child_weight_100    [0.58603, 0.55264, 0.60075, 0.56095, 0.56961]
min_child_weight_150    [0.57981, 0.55451, 0.59484, 0.56493, 0.56796]
min_child_weight_200      [0.5773, 0.55988, 0.5937, 0.56723, 0.57061]
min_child_weight_250       [0.58225, 0.56556, 0.601, 0.56785, 0.5732]
min_child_weight_300    [0.58125, 0.57181, 0.60151, 0.57276, 0.57158]
dtype: object
CPU times: user 41min 5s, sys: 5.6 s, total: 41min 10s
Wall time: 5min 12s


In [29]:
%%time
final_dict = {}
#200
parameter_tuned = 'min_split_gain'
for i in [0.05,0.1,0.15,0.20]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': i, 
    'num_leaves': 7, 
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.05 [0.5815, 0.55448, 0.594, 0.56581, 0.56776]
MULTI WEIGHTED LOG LOSS : 0.57269 
0.1 [0.57981, 0.55451, 0.59484, 0.56493, 0.56796]
MULTI WEIGHTED LOG LOSS : 0.57238 
0.15 [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
MULTI WEIGHTED LOG LOSS : 0.57223 
0.2 [0.58367, 0.55381, 0.59515, 0.56415, 0.56677]
MULTI WEIGHTED LOG LOSS : 0.57270 
min_split_gain_0.05       [0.5815, 0.55448, 0.594, 0.56581, 0.56776]
min_split_gain_0.1     [0.57981, 0.55451, 0.59484, 0.56493, 0.56796]
min_split_gain_0.15     [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
min_split_gain_0.2     [0.58367, 0.55381, 0.59515, 0.56415, 0.56677]
dtype: object
CPU times: user 49min 52s, sys: 9.29 s, total: 50min 1s
Wall time: 6min 55s


In [31]:
%%time
final_dict = {}
#7
parameter_tuned = 'num_leaves'
for i in [7,15,25,35,45,65]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': i, 
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

7 [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
MULTI WEIGHTED LOG LOSS : 0.57223 
15 [0.59159, 0.58486, 0.61192, 0.58034, 0.58697]
MULTI WEIGHTED LOG LOSS : 0.59112 
25 [0.59974, 0.59089, 0.62314, 0.58964, 0.59799]
MULTI WEIGHTED LOG LOSS : 0.60024 
35 [0.59917, 0.58728, 0.62374, 0.59459, 0.59852]
MULTI WEIGHTED LOG LOSS : 0.60064 
45 [0.60018, 0.58814, 0.62054, 0.59383, 0.59798]
MULTI WEIGHTED LOG LOSS : 0.60009 
65 [0.60018, 0.58814, 0.62054, 0.59383, 0.59798]
MULTI WEIGHTED LOG LOSS : 0.60009 
num_leaves_7      [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
num_leaves_15    [0.59159, 0.58486, 0.61192, 0.58034, 0.58697]
num_leaves_25    [0.59974, 0.59089, 0.62314, 0.58964, 0.59799]
num_leaves_35    [0.59917, 0.58728, 0.62374, 0.59459, 0.59852]
num_leaves_45    [0.60018, 0.58814, 0.62054, 0.59383, 0.59798]
num_leaves_65    [0.60018, 0.58814, 0.62054, 0.59383, 0.59798]
dtype: object
CPU times: user 48min 37s, sys: 9.52 s, total: 48min 47s
Wall time: 6min 9s


In [32]:
%%time
final_dict = {}
#7
parameter_tuned = 'num_leaves'
for i in [5,7,9,11]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': i, 
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

5 [0.57984, 0.55343, 0.59763, 0.56107, 0.5707]
MULTI WEIGHTED LOG LOSS : 0.57253 
7 [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
MULTI WEIGHTED LOG LOSS : 0.57223 
9 [0.57986, 0.56351, 0.59991, 0.57124, 0.57725]
MULTI WEIGHTED LOG LOSS : 0.57834 
11 [0.58753, 0.57099, 0.60249, 0.57121, 0.58181]
MULTI WEIGHTED LOG LOSS : 0.58280 
num_leaves_5      [0.57984, 0.55343, 0.59763, 0.56107, 0.5707]
num_leaves_7      [0.58143, 0.55315, 0.59422, 0.5652, 0.56725]
num_leaves_9     [0.57986, 0.56351, 0.59991, 0.57124, 0.57725]
num_leaves_11    [0.58753, 0.57099, 0.60249, 0.57121, 0.58181]
dtype: object
CPU times: user 31min 53s, sys: 4.51 s, total: 31min 57s
Wall time: 4min 2s


In [33]:
%%time
final_dict = {}
#7
parameter_tuned = 'num_leaves'
for i in [6,8]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': i, 
    'reg_alpha': 0.0, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

6 [0.57999, 0.54996, 0.59388, 0.56439, 0.56667]
MULTI WEIGHTED LOG LOSS : 0.57099 
8 [0.581, 0.5564, 0.59821, 0.56681, 0.57524]
MULTI WEIGHTED LOG LOSS : 0.57551 
num_leaves_6    [0.57999, 0.54996, 0.59388, 0.56439, 0.56667]
num_leaves_8       [0.581, 0.5564, 0.59821, 0.56681, 0.57524]
dtype: object
CPU times: user 16min 33s, sys: 2.19 s, total: 16min 35s
Wall time: 2min 5s


In [35]:
%%time
final_dict = {}
#7
parameter_tuned = 'reg_alpha'
for i in [0.0,0.1,0.5,1,5,10,100]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': 6, 
    'reg_alpha': i, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.0 [0.57999, 0.54996, 0.59388, 0.56439, 0.56667]
MULTI WEIGHTED LOG LOSS : 0.57099 
0.1 [0.5764, 0.54995, 0.59254, 0.56548, 0.56649]
MULTI WEIGHTED LOG LOSS : 0.57015 
0.5 [0.57755, 0.55161, 0.59464, 0.56404, 0.56691]
MULTI WEIGHTED LOG LOSS : 0.57095 
1 [0.57801, 0.55004, 0.59359, 0.56505, 0.56685]
MULTI WEIGHTED LOG LOSS : 0.57071 
5 [0.57933, 0.55036, 0.59298, 0.56286, 0.56855]
MULTI WEIGHTED LOG LOSS : 0.57081 
10 [0.57759, 0.55286, 0.59648, 0.56105, 0.56955]
MULTI WEIGHTED LOG LOSS : 0.57151 
100 [0.58838, 0.57936, 0.62427, 0.58124, 0.5892]
MULTI WEIGHTED LOG LOSS : 0.59248 
reg_alpha_0.0    [0.57999, 0.54996, 0.59388, 0.56439, 0.56667]
reg_alpha_0.1     [0.5764, 0.54995, 0.59254, 0.56548, 0.56649]
reg_alpha_0.5    [0.57755, 0.55161, 0.59464, 0.56404, 0.56691]
reg_alpha_1      [0.57801, 0.55004, 0.59359, 0.56505, 0.56685]
reg_alpha_5      [0.57933, 0.55036, 0.59298, 0.56286, 0.56855]
reg_alpha_10     [0.57759, 0.55286, 0.59648, 0.56105, 0.56955]
reg_alpha_100     [0.58838, 0.5793

In [37]:
%%time
final_dict = {}
#7
parameter_tuned = 'reg_lambda'
for i in [0.0001,0.00023,0.0005,0.001,0.005,0.01]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': 6, 
    'reg_alpha': 0.1, 
    'reg_lambda': i, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.0001 [0.5764, 0.54995, 0.59254, 0.56376, 0.56649]
MULTI WEIGHTED LOG LOSS : 0.56981 
0.00023 [0.5764, 0.54995, 0.59254, 0.56548, 0.56649]
MULTI WEIGHTED LOG LOSS : 0.57015 
0.0005 [0.5764, 0.54995, 0.59261, 0.56548, 0.5659]
MULTI WEIGHTED LOG LOSS : 0.57005 
0.001 [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
MULTI WEIGHTED LOG LOSS : 0.56984 
0.005 [0.57847, 0.54941, 0.59232, 0.56743, 0.56645]
MULTI WEIGHTED LOG LOSS : 0.57083 
0.01 [0.57834, 0.54906, 0.59288, 0.56433, 0.56517]
MULTI WEIGHTED LOG LOSS : 0.56995 
reg_lambda_0.0001      [0.5764, 0.54995, 0.59254, 0.56376, 0.56649]
reg_lambda_0.00023     [0.5764, 0.54995, 0.59254, 0.56548, 0.56649]
reg_lambda_0.0005       [0.5764, 0.54995, 0.59261, 0.56548, 0.5659]
reg_lambda_0.001        [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
reg_lambda_0.005      [0.57847, 0.54941, 0.59232, 0.56743, 0.56645]
reg_lambda_0.01       [0.57834, 0.54906, 0.59288, 0.56433, 0.56517]
dtype: object
CPU times: user 51min 9s, sys: 6.43 s, total: 51min 15s
Wal

In [38]:
%%time
final_dict = {}
#7
parameter_tuned = 'subsample'
for i in [0.45,0.55,0.65,0.75,0.85,0.95]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': 6, 
    'reg_alpha': 0.1, 
    'reg_lambda': 0.001, 
    'skip_drop': 0.44, 
    'subsample': i}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.45 [0.59853, 0.57488, 0.59784, 0.57983, 0.57539]
MULTI WEIGHTED LOG LOSS : 0.58531 
0.55 [0.5868, 0.55992, 0.58625, 0.57378, 0.56725]
MULTI WEIGHTED LOG LOSS : 0.57482 
0.65 [0.58232, 0.55812, 0.59304, 0.57077, 0.56938]
MULTI WEIGHTED LOG LOSS : 0.57474 
0.75 [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
MULTI WEIGHTED LOG LOSS : 0.56984 
0.85 [0.58654, 0.55706, 0.5915, 0.55684, 0.5743]
MULTI WEIGHTED LOG LOSS : 0.57330 
0.95 [0.59008, 0.562, 0.59881, 0.56095, 0.57514]
MULTI WEIGHTED LOG LOSS : 0.57746 
subsample_0.45    [0.59853, 0.57488, 0.59784, 0.57983, 0.57539]
subsample_0.55     [0.5868, 0.55992, 0.58625, 0.57378, 0.56725]
subsample_0.65    [0.58232, 0.55812, 0.59304, 0.57077, 0.56938]
subsample_0.75      [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
subsample_0.85      [0.58654, 0.55706, 0.5915, 0.55684, 0.5743]
subsample_0.95      [0.59008, 0.562, 0.59881, 0.56095, 0.57514]
dtype: object
CPU times: user 50min 4s, sys: 6.35 s, total: 50min 10s
Wall time: 6min 20s


In [39]:
%%time
final_dict = {}
#7
parameter_tuned = 'subsample'
for i in [0.73,0.74,0.75,0.76,0.77]:
    loss_list = []
    temp = train_metadata_kaggle.copy()
    
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    train_id = temp['object_id']
    del temp['object_id']
    
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'n_estimators': 4000, 
    'subsample_freq': 6, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 40,
    'min_child_weight': 150, 
    'min_split_gain': 0.15, 
    'num_leaves': 6, 
    'reg_alpha': 0.1, 
    'reg_lambda': 0.001, 
    'skip_drop': 0.44, 
    'subsample': i}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(round(loss_oof,5))
        #print(fold_,loss_oof)

    print(i,loss_list) 
    final_dict[parameter_tuned + '_' + str(i)] = loss_list
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
print(pd.Series(final_dict))

0.73 [0.57816, 0.5533, 0.58652, 0.56378, 0.56792]
MULTI WEIGHTED LOG LOSS : 0.56997 
0.74 [0.5783, 0.54968, 0.58956, 0.56561, 0.5665]
MULTI WEIGHTED LOG LOSS : 0.56994 
0.75 [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
MULTI WEIGHTED LOG LOSS : 0.56984 
0.76 [0.58192, 0.55302, 0.59044, 0.56234, 0.56849]
MULTI WEIGHTED LOG LOSS : 0.57125 
0.77 [0.58279, 0.55087, 0.59781, 0.56471, 0.56965]
MULTI WEIGHTED LOG LOSS : 0.57314 
subsample_0.73     [0.57816, 0.5533, 0.58652, 0.56378, 0.56792]
subsample_0.74      [0.5783, 0.54968, 0.58956, 0.56561, 0.5665]
subsample_0.75      [0.5764, 0.54968, 0.59261, 0.56469, 0.5659]
subsample_0.76    [0.58192, 0.55302, 0.59044, 0.56234, 0.56849]
subsample_0.77    [0.58279, 0.55087, 0.59781, 0.56471, 0.56965]
dtype: object
CPU times: user 1h 6min 8s, sys: 9.95 s, total: 1h 6min 18s
Wall time: 9min 18s


In [40]:
#final_dict2[final_dict2['column_name'].isin(most_imp_ones)]
#final_dict2[final_dict2['fold_sum'] == 1]

In [41]:
#final_dict2.head()

In [42]:
#final_dict2.to_csv('final_features3.csv',index=False)

In [43]:
#modify to work with kfold
#def smoteAdataset(Xig, yig, test_size=0.2, random_state=0):
def smoteAdataset(Xig_train, yig_train, Xig_test, yig_test):
    
        
    sm=SMOTE(random_state=51)
    Xig_train_res, yig_train_res = sm.fit_sample(Xig_train, yig_train.ravel())

        
    return Xig_train_res, pd.Series(yig_train_res), Xig_test, pd.Series(yig_test)

In [44]:
%%time
final_dict = {}

loss_list = []
temp = train_metadata_kaggle.copy()

print(temp.shape)
temp.fillna(0, inplace=True)

y = temp['target']
del temp['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
'n_estimators': 4000, 
'subsample_freq': 6, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 40,
'min_child_weight': 150, 
'min_split_gain': 0.15, 
'num_leaves': 6, 
'reg_alpha': 0.1, 
'reg_lambda': 0.001, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    trn_xa, trn_y, val_xa, val_y=smoteAdataset(trn_x.values, trn_y.values, val_x.values, val_y.values)
    trn_x=pd.DataFrame(data=trn_xa, columns=trn_x.columns)
    val_x=pd.DataFrame(data=val_xa, columns=val_x.columns)
    
    print(trn_x.shape,trn_y.shape,val_x.shape,val_y.shape)
    
    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

(7848, 139)
(25900, 137) (25900,) (1574, 137) (1574,)
0 0.5407134604014688
(25900, 137) (25900,) (1572, 137) (1572,)
1 0.5505294469507865
(25900, 137) (25900,) (1571, 137) (1571,)
2 0.5930505215690117
(25914, 137) (25914,) (1567, 137) (1567,)
3 0.5412398994685668
(25914, 137) (25914,) (1564, 137) (1564,)
4 0.5650806630208185
MULTI WEIGHTED LOG LOSS : 0.55802 
CPU times: user 38min 49s, sys: 5.92 s, total: 38min 55s
Wall time: 5min 53s


In [None]:
#used_columns = used_columns1 + used_columns2 + used_columns3 

In [None]:
#train_metadata.drop([x for x in train_metadata.columns if x not in ['object_id'] + used_columns ] ,axis = 1,inplace=True)

In [None]:
#test_metadata = test_metadata[[x for x in test_metadata.columns if x in ['object_id'] + used_columns ]]

In [46]:
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

(7848, 139) (3492890, 138)


In [47]:
train_metadata_kaggle.head()

Unnamed: 0,object_id,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,flux_err_min,flux_err_max,flux_err_mean,flux_err_median,flux_err_std,flux_err_skew,detected_mean,flux_ratio_sq_sum,flux_ratio_sq_skew,flux_by_flux_ratio_sq_sum,flux_by_flux_ratio_sq_skew,flux_w_mean,flux_diff1,flux_diff2,flux_diff3,"0__fft_coefficient__coeff_0__attr_""abs""","0__fft_coefficient__coeff_1__attr_""abs""",0__kurtosis,0__skewness,"1__fft_coefficient__coeff_0__attr_""abs""","1__fft_coefficient__coeff_1__attr_""abs""",1__kurtosis,1__skewness,"2__fft_coefficient__coeff_0__attr_""abs""","2__fft_coefficient__coeff_1__attr_""abs""",2__kurtosis,2__skewness,"3__fft_coefficient__coeff_0__attr_""abs""","3__fft_coefficient__coeff_1__attr_""abs""",3__kurtosis,3__skewness,"4__fft_coefficient__coeff_0__attr_""abs""","4__fft_coefficient__coeff_1__attr_""abs""",4__kurtosis,4__skewness,"5__fft_coefficient__coeff_0__attr_""abs""","5__fft_coefficient__coeff_1__attr_""abs""",5__kurtosis,5__skewness,flux__length,flux__longest_strike_above_mean,flux__longest_strike_below_mean,flux__mean_abs_change,flux__mean_change,flux_by_flux_ratio_sq__longest_strike_above_mean,flux_by_flux_ratio_sq__longest_strike_below_mean,mjd__mean_abs_change,mjd__mean_change,mjd_diff_det,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,haversine,latlon1,hostgal_photoz_certain,A0_sum_flux,A0_mean_flux,A0_std_detected,A1_mean_detected,A2_sum_detected,A4_mean_detected,A5_std_detected,A5_mean_detected,percent_p2_region_minus_1,A2_min_flux,A5_sum_detected,__flux_percentile_ratio_mid50___5_,__flux_percentile_ratio_mid65___2_,__median_absolute_deviation___2_,__qso_log_chi2_qsonu___0_,__stetson_k___1_,__freq1_signif___2_,__stetson_k___2_,__freq3_amplitude1___1_,__median_absolute_deviation___2_.1,__percent_close_to_median___2_,__freq_varrat___5_,__freq_varrat___4_,__qso_log_chi2_qsonu___3_,__qso_log_chi2_qsonu___1_,__qso_log_chi2_qsonu___5_,__std___4_,__freq_varrat___3_,__amplitude___2_,outlierScore,hipd,lipd,highEnergy_transitory_1.0_TF,highEnergy_transitory_1.5_TF,lowEnergy_transitory_1.0_TF,lowEnergy_transitory_1.5_TF,A1_minus_3_sigma,A5_max_median_diff_flux,A5_minus_3_sigma,A5_max_mean_diff_flux,diff_A5_A4_max_min_flux,diff_A2_A1_max_min_flux,diff_A3_A2_median_min_flux,diff_A5_A4_max_median_flux,diff_A4_A3_max_median_flux,diff_A2_A0_median_min_flux,diff_A4_A3_max_mean_flux,diff_A5_A2_max_mean_flux,diff_A5_A3_max_mean_flux,diff_A4_A0_median_mean_flux,diff_A5_A4_max_mean_flux,diff_A2_A1_max_median_flux,diff_A5_A2_max_median_flux,diff_A5_A4_median_min_flux,diff_A4_A0_median_min_flux,diff_A4_A1_max_median_flux,diff_A4_A2_max_median_flux,diff_A5_A4_minus_1_sigma,diff_A5_A3_median_min_flux,diff_A5_A3_max_median_flux,diff_A3_A1_minus_1_sigma,diff_A3_A0_median_min_flux,diff_A3_A0_plus_1_sigma,diff_A1_A0_median_min_flux,diff_A4_A2_mean_min_flux,diff_A5_A1_plus_1_sigma,diff_A4_A1_median_mean_flux,diff_A3_A2_max_median_flux,diff_A5_A1_median_mean_flux,div_A4_A2_median_min_flux,div_A5_A2_median_min_flux,div_A5_A2_minus_1_sigma,div_A5_A4_median_mean_flux,div_A3_A0_plus_1_sigma,div_A4_A1_minus_1_sigma
0,615,-1100.440063,660.626343,-123.096998,-89.477524,394.109851,-0.34954,2.13051,12.845472,4.482743,3.835269,1.744747,1.62374,0.946023,2929669.0,0.812722,-960176600.0,-1.414322,-327.742307,1761.066406,-14.306331,-5.373326,205.036926,1628.427737,-1.475181,0.128917,22370.594834,2806.374162,-1.255123,0.41558,7780.500807,2805.598113,-1.409885,0.339918,7024.003068,2536.068846,-1.449858,0.293128,3245.366349,2741.539785,-1.548319,0.200096,2704.641265,2893.344217,-1.59282,0.125268,352.0,19.0,29.0,202.114067,1.999688,35.0,4.0,2.631898,2.631898,873.7903,0.0,0.0,,0.017,92,0.319006,-1.528827,0.0,-205.03693,-3.254554,0.3528,0.9653,57,0.983,0.2854,0.912,0.362,-682.0,52,5.56223e-26,6.71941e-20,368.129,6.21789,1.09173,5.49891,1.05349,114.465,368.129,0.172414,0.401664,0.129578,9.16612,9.50875,7.34498,289.277,0.110785,646.922,0.0,1.0,1.0,0,0,0,0,-2191.1619,463.71245,-931.78835,425.63799,-3.135,-466.64178,-47.8,-21.78266,-122.34195,309.302005,-128.932226,-320.49313,-141.20257,-40.826106,-12.270344,-270.84178,-414.07211,18.64766,211.745135,-663.13123,-392.28945,5.528684,-31.10921,-144.12461,531.05875,261.502005,133.631375,505.102005,-181.623534,31.34235,54.613224,-269.9475,64.12554,0.765601,0.810406,0.58077,0.800106,2.656105,0.352129
1,713,-14.735178,14.770886,-1.423351,-0.873033,6.471144,0.014989,0.639458,9.115748,2.35962,1.998217,1.509888,1.633246,0.171429,5886.068,3.439423,-28750.87,-3.454554,-4.884564,29.506064,-20.730002,-6.040676,190.427851,299.586559,-1.014003,0.260052,57.109047,192.539229,-1.09717,-0.087865,44.477327,191.057528,-1.188472,-0.022678,55.270113,212.522263,-1.142896,-0.167176,50.414646,203.892482,-1.190245,-0.064134,100.473776,143.963093,-0.797047,0.218182,350.0,50.0,73.0,2.935177,-0.050944,199.0,8.0,14.352571,14.352571,846.8017,1.6267,0.2552,45.4063,0.007,88,1.698939,3.258921,2.099614,-190.42786,-2.720398,0.3525,0.2678,15,0.0893,0.0,0.0,0.25,-10.07,0,0.0211907,0.0824318,5.10035,2.18719,1.0661,3.95669,1.08818,0.851103,5.10035,0.178571,0.369518,0.166179,2.79753,3.12481,0.659762,6.34953,0.111883,10.2985,0.875,1.909016,2.0,1,0,1,1,-18.157002,17.233897,-23.076394,16.565061,6.867315,-0.24998,2.3741,6.613788,-0.784106,-1.686373,-1.589087,5.241782,4.247778,0.484493,5.836865,0.95652,6.586856,0.253527,-0.143748,0.929589,-0.026931,-1.580997,-0.577947,5.829681,-0.705045,0.687727,1.070336,-0.479873,2.110778,0.607564,-0.350414,0.757175,-1.127337,1.155007,1.180482,1.353889,-6.188008,1.24364,1.085396
2,730,-19.159811,47.310059,2.267434,0.409172,8.022239,3.177854,0.695106,11.281384,2.471061,1.990851,1.721134,1.823726,0.069697,4124.452,5.480405,104650.2,5.989138,25.37311,66.46987,29.315018,2.619697,3.46179,4.729538,0.474215,0.35691,7.334944,13.515895,0.976374,0.471342,124.84525,119.500254,5.13129,2.385066,168.280524,162.799417,7.125665,2.662075,219.745132,202.532898,6.081065,2.537802,231.509177,199.28637,3.58313,1.680352,330.0,13.0,32.0,4.227614,-0.008131,4.0,222.0,3.580623,3.580623,78.7737,0.2262,0.0157,40.2561,0.021,42,1.81803,3.128522,0.229779,-3.46179,-0.04808,0.0,0.0,7,0.098,0.2715,0.0784,0.769,-2.85,4,0.000194228,0.55118,1.04253,-0.307228,0.933091,4.61663,0.634723,0.454918,1.04253,0.769231,0.500549,0.318256,3.04833,0.127758,1.66943,10.6048,0.292954,11.9218,0.0,1.0,1.0,0,0,0,0,-5.280586,44.767413,-35.458878,42.770664,19.473581,14.758601,2.7569,4.611787,7.244126,-0.142353,6.515316,24.176824,12.434728,-3.376547,5.919412,14.981301,24.264403,14.861794,3.356611,34.633917,19.652616,-2.391746,15.603858,11.855913,-3.289694,2.614547,9.647359,0.080347,4.894168,15.923825,-3.334674,12.40849,-2.027049,2.047061,6.494436,2.783931,0.604275,6.417455,3.84221
3,745,-15.494463,220.795212,8.909206,1.035895,27.558208,4.979826,0.56717,55.892746,2.555576,1.819875,3.537324,10.741655,0.173789,94161.65,9.611274,14391250.0,11.141069,152.835617,236.289675,26.521968,1.546038,129.421659,123.298327,4.629801,2.023211,320.174052,280.440312,50.86888,7.007099,543.845781,491.54827,36.088137,5.688194,807.123762,710.721942,16.392533,3.751603,735.528417,680.05528,13.747434,3.47642,591.037583,523.503586,12.134629,3.170857,351.0,19.0,115.0,7.065548,0.008044,4.0,201.0,2.061453,2.061453,123.6872,0.2813,1.1523,40.7951,0.007,90,0.495223,6.893743,0.890445,129.42166,1.797523,0.1655,0.125,16,0.2322,0.3364,0.1273,0.768,-2.16,7,0.0084016,0.546369,1.41645,1.4322,0.295163,3.96789,0.394683,3.59567,1.41645,0.892857,0.489589,0.360868,6.06886,5.84082,2.82044,32.7725,0.290652,111.477,0.0,1.0,1.0,0,0,0,0,-72.173977,138.763735,-67.434252,130.767152,-47.364906,27.09091,3.7217,-43.599234,-19.526731,-2.346763,-18.339091,-80.316526,-58.070623,-11.123477,-39.731532,29.01461,-81.607175,-3.765671,11.83355,-8.993331,-38.007941,4.620626,6.692942,-63.125965,-0.310113,1.374937,43.208654,-0.423063,16.757367,5.121609,-7.032285,-18.48121,-3.164583,6.4871,5.029966,0.688379,0.674005,8.00079,0.984685
4,1124,-16.543753,143.600189,7.145702,1.141288,20.051722,4.406298,0.695277,11.38369,2.753004,2.214854,1.933837,1.794938,0.173295,34324.18,7.868462,3015599.0,7.908174,87.85639,160.143942,22.411225,1.822792,41.639721,32.987125,0.822496,-0.332169,268.808929,207.812015,6.112295,2.377222,594.150153,498.50982,10.343254,3.075437,643.020183,555.512641,14.095862,3.603208,574.553907,524.107264,16.377058,3.904008,393.114268,357.907185,14.43447,3.657305,352.0,19.0,158.0,6.727352,0.012543,10.0,231.0,2.231855,2.231855,133.9113,0.2415,0.0176,40.4166,0.024,90,0.395162,-1.928064,0.245788,41.63972,0.660948,0.0,0.2241,18,0.1724,0.2578,0.0702,0.707,-2.084,4,0.0270226,0.601811,1.33779,-0.064359,0.674119,5.24444,0.560453,1.20558,1.33779,0.741379,0.673592,0.382847,5.07231,3.4079,2.31292,26.6333,0.250639,54.3781,0.375,1.0,1.0,0,0,1,0,-19.689575,107.184313,-57.407143,102.260844,-40.125818,68.962513,0.841,-34.670865,3.050778,-4.41273,4.962243,5.833122,-26.471001,-8.081169,-31.433244,69.766513,1.401623,-5.454953,10.903035,105.839001,36.072488,2.421924,9.019812,-31.620087,-11.709569,-3.57173,34.336172,-3.60873,14.121887,15.590844,-4.68209,33.02171,-1.444469,6.15162,4.316788,1.312571,0.603286,12.365708,4.881188


In [48]:
test_metadata_kaggle.head()

Unnamed: 0,object_id,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,flux_err_min,flux_err_max,flux_err_mean,flux_err_median,flux_err_std,flux_err_skew,detected_mean,flux_ratio_sq_sum,flux_ratio_sq_skew,flux_by_flux_ratio_sq_sum,flux_by_flux_ratio_sq_skew,flux_w_mean,flux_diff1,flux_diff2,flux_diff3,"0__fft_coefficient__coeff_0__attr_""abs""","0__fft_coefficient__coeff_1__attr_""abs""",0__kurtosis,0__skewness,"1__fft_coefficient__coeff_0__attr_""abs""","1__fft_coefficient__coeff_1__attr_""abs""",1__kurtosis,1__skewness,"2__fft_coefficient__coeff_0__attr_""abs""","2__fft_coefficient__coeff_1__attr_""abs""",2__kurtosis,2__skewness,"3__fft_coefficient__coeff_0__attr_""abs""","3__fft_coefficient__coeff_1__attr_""abs""",3__kurtosis,3__skewness,"4__fft_coefficient__coeff_0__attr_""abs""","4__fft_coefficient__coeff_1__attr_""abs""",4__kurtosis,4__skewness,"5__fft_coefficient__coeff_0__attr_""abs""","5__fft_coefficient__coeff_1__attr_""abs""",5__kurtosis,5__skewness,flux__length,flux__longest_strike_above_mean,flux__longest_strike_below_mean,flux__mean_abs_change,flux__mean_change,flux_by_flux_ratio_sq__longest_strike_above_mean,flux_by_flux_ratio_sq__longest_strike_below_mean,mjd__mean_abs_change,mjd__mean_change,mjd_diff_det,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,haversine,latlon1,hostgal_photoz_certain,A0_sum_flux,A0_mean_flux,A0_std_detected,A1_mean_detected,A2_sum_detected,A4_mean_detected,A5_std_detected,A5_mean_detected,percent_p2_region_minus_1,A2_min_flux,A5_sum_detected,__flux_percentile_ratio_mid50___5_,__flux_percentile_ratio_mid65___2_,__median_absolute_deviation___2_,__qso_log_chi2_qsonu___0_,__stetson_k___1_,__freq1_signif___2_,__stetson_k___2_,__freq3_amplitude1___1_,__median_absolute_deviation___2_.1,__percent_close_to_median___2_,__freq_varrat___5_,__freq_varrat___4_,__qso_log_chi2_qsonu___3_,__qso_log_chi2_qsonu___1_,__qso_log_chi2_qsonu___5_,__std___4_,__freq_varrat___3_,__amplitude___2_,outlierScore,hipd,lipd,highEnergy_transitory_1.0_TF,highEnergy_transitory_1.5_TF,lowEnergy_transitory_1.0_TF,lowEnergy_transitory_1.5_TF,A1_minus_3_sigma,A5_max_median_diff_flux,A5_minus_3_sigma,A5_max_mean_diff_flux,diff_A5_A4_max_min_flux,diff_A2_A1_max_min_flux,diff_A3_A2_median_min_flux,diff_A5_A4_max_median_flux,diff_A4_A3_max_median_flux,diff_A2_A0_median_min_flux,diff_A4_A3_max_mean_flux,diff_A5_A2_max_mean_flux,diff_A5_A3_max_mean_flux,diff_A4_A0_median_mean_flux,diff_A5_A4_max_mean_flux,diff_A2_A1_max_median_flux,diff_A5_A2_max_median_flux,diff_A5_A4_median_min_flux,diff_A4_A0_median_min_flux,diff_A4_A1_max_median_flux,diff_A4_A2_max_median_flux,diff_A5_A4_minus_1_sigma,diff_A5_A3_median_min_flux,diff_A5_A3_max_median_flux,diff_A3_A1_minus_1_sigma,diff_A3_A0_median_min_flux,diff_A3_A0_plus_1_sigma,diff_A1_A0_median_min_flux,diff_A4_A2_mean_min_flux,diff_A5_A1_plus_1_sigma,diff_A4_A1_median_mean_flux,diff_A3_A2_max_median_flux,diff_A5_A1_median_mean_flux,div_A4_A2_median_min_flux,div_A5_A2_median_min_flux,div_A5_A2_minus_1_sigma,div_A5_A4_median_mean_flux,div_A3_A0_plus_1_sigma,div_A4_A1_minus_1_sigma
0,13,-12.680235,42.765503,3.997127,0.616561,9.149645,2.037355,0.691634,11.257108,2.46181,1.972972,1.718101,1.826388,0.157576,7806.412424,4.771625,189634.6,5.396523,24.292155,55.445738,13.871398,2.282455,29.002872,37.684425,-0.24716,0.147622,92.03969,79.990817,10.567412,3.117684,164.640622,139.733762,4.512783,2.171455,300.547278,246.788411,2.951479,1.806797,408.305525,349.008202,0.094963,1.153076,324.51588,251.116361,0.072799,0.804929,330.0,15.0,32.0,4.426774,0.017885,9.0,222.0,2.350061,2.350061,119.8531,0.3193,0.0542,41.1123,0.019,1.851382,3.049709,0.337084,29.002872,0.402818,0.0,0.0769,11.0,0.3137,0.3254,0.1177,0.7114,-1.778855,6.0,0.005627,0.528087,1.38469,-0.007597,0.61019,4.81984,0.681284,1.78339,1.38469,0.711538,0.25503,0.09055,3.49944,2.78816,1.51079,13.2494,0.124063,13.1542,1.0,1.0,1.0,1,1,1,0,-13.032169,33.023636,-29.949324,30.439033,1.203725,-1.244573,3.14635,-9.275401,3.413229,-1.906972,0.572977,9.075555,-3.747502,-7.224033,-4.320479,1.245629,8.963043,10.479126,1.824685,19.484073,18.238444,-0.365809,11.064433,-5.862173,-1.251189,1.239378,13.511033,0.58323,8.574067,11.763127,-6.359525,14.825216,-1.404603,2.660059,7.321785,1.927288,0.342807,6.483993,1.698851
1,14,-11.142164,14.839427,0.884047,0.072856,3.399946,0.970525,0.690589,11.249375,2.45758,1.973559,1.717591,1.826703,0.012121,806.406927,11.486148,5525.817,12.348124,6.852393,25.981591,29.389389,3.791608,22.708482,26.159787,0.099267,0.502325,13.685195,27.630359,13.429229,3.109318,31.012899,33.427074,13.769006,3.247873,56.042403,59.784625,1.75084,1.082798,83.561278,77.494564,-0.322108,-0.099957,84.725142,27.013154,-0.348744,0.132025,330.0,10.0,15.0,3.055953,-0.025935,4.0,85.0,9.3351,9.3351,28.0053,0.6323,0.0179,42.8774,0.018,1.855173,3.009107,0.64372,22.70848,0.315396,0.0,0.01923,1.0,0.0,0.0,0.0,0.615,-2.655194,0.0,0.014907,0.278418,0.8139,0.266805,0.640702,3.31422,0.665239,1.07237,0.8139,0.711538,0.623417,0.258451,0.848093,1.33958,-0.158567,3.44694,0.306123,8.56024,0.0,1.0,1.0,0,0,0,0,-8.039201,13.70517,-14.805055,13.17815,10.362185,-1.201913,1.962808,6.524891,-2.977615,-1.396471,-1.799531,-0.690726,3.786349,0.761845,5.58588,0.557757,-0.730177,3.837294,4.357531,-6.697311,-7.255068,-1.984715,7.628487,3.547276,0.777005,0.566338,1.195116,0.363199,4.775541,4.119388,0.804491,-4.277454,-0.13452,3.142918,4.572011,1.764152,-1.279205,1.444674,0.735858
2,17,-14.202744,16.76128,0.791032,0.45839,3.886578,0.377131,0.66368,11.278636,2.702947,2.184483,1.922641,1.802497,0.014205,784.835502,9.509911,4124.4,9.923556,5.255113,30.964024,39.143819,5.89217,6.030862,46.450439,0.590786,-0.427429,53.366119,41.192678,4.127763,1.710048,58.60717,42.353288,8.642889,2.570936,26.975615,59.958041,3.484929,1.141959,57.222812,72.448018,2.085327,0.429582,76.240782,73.533783,0.063837,-0.175525,352.0,15.0,15.0,3.247664,0.02484,4.0,75.0,0.6907,0.6907,2.7628,0.8297,0.0605,43.6,0.016,0.309914,-1.49029,0.881446,6.030863,0.095728,0.0,0.0345,2.0,0.0,0.0,0.0,0.6206,-2.790141,0.0,0.000377,0.323662,0.845784,0.335972,0.798601,3.18365,0.696152,1.0381,0.845784,0.775862,0.669229,0.427519,0.684981,1.2138,0.066402,3.80032,0.493005,7.88173,0.0,1.0,1.0,0,0,0,0,-7.832717,14.683493,-19.129198,15.423722,8.578852,0.193475,2.646153,3.214523,1.212961,-4.544296,0.695796,3.460874,4.996827,-0.503209,4.301032,1.040025,2.09292,5.364329,3.199021,-0.081578,-1.121603,-2.637786,10.461493,4.427483,-0.369331,-1.898143,0.430738,-3.697746,7.461872,4.322237,0.368021,-2.334563,1.454529,3.440466,5.131144,3.490031,-2.137666,1.150274,1.425184
3,23,-12.631923,28.061138,0.970396,0.465986,4.100713,2.177402,0.697639,11.305429,2.480364,2.003704,1.723367,1.821626,0.018182,876.027511,8.010597,8293.673,9.227223,9.467365,40.693061,41.934474,4.298245,0.776942,14.92649,-0.580529,-0.058688,2.371129,22.476077,1.429683,-0.000571,34.44773,14.240274,1.223998,0.468594,58.911457,62.709899,4.269139,1.41438,115.918277,70.371095,7.377841,2.409871,112.547521,123.413344,1.793238,0.953107,330.0,10.0,12.0,3.299337,-0.001116,4.0,98.0,0.6025,0.6025,3.0125,0.6533,0.1479,42.964,0.023,1.845038,3.078301,0.757434,0.776942,0.010791,0.0,0.0,2.0,0.0392,0.0,0.0,0.3845,-4.254661,0.0,0.01371,0.402374,0.90629,0.039595,0.918965,3.55265,0.934093,0.445288,0.90629,0.538462,0.709745,0.40574,1.51212,0.154437,0.664241,4.61269,0.415919,4.82121,0.0,1.0,1.0,0,0,0,0,-5.651117,26.62437,-20.850721,25.854324,15.809828,-0.666392,1.807202,6.525865,8.105404,0.318818,7.272845,21.129028,14.127387,-0.958548,6.854541,-0.398379,21.839384,9.283963,0.246119,14.91514,15.313519,-3.093347,7.404063,14.631269,-0.262982,2.126019,2.369858,0.586831,0.966333,8.069753,-1.239923,7.208114,-0.911246,0.985034,2.896326,4.842085,0.700856,2.143172,1.24637
4,34,-13.239577,124.475609,4.58007,0.301366,19.862714,4.537527,0.679312,11.365292,2.746784,2.210577,1.93079,1.798629,0.065341,47612.580669,9.140568,4815012.0,10.251332,101.128982,137.715186,30.068359,1.361778,65.064185,55.191226,7.457425,2.135457,202.641918,191.410916,16.39371,4.133929,425.940451,414.455381,15.206929,4.036551,368.228927,360.630612,15.386561,4.030676,377.744797,301.98395,15.302549,3.99274,172.564336,289.757599,12.507289,3.513977,352.0,24.0,52.0,5.054175,-0.026764,10.0,203.0,1.138964,1.138964,25.0572,0.4617,0.0122,42.054,0.023,0.391772,-1.993709,0.467367,65.064186,1.032765,0.1768,0.069,5.0,0.069,0.2253,0.05264,0.9136,-3.601765,3.0,0.005099,0.597406,1.02602,0.976849,0.393851,3.96443,0.359669,4.31107,1.02602,0.931034,0.813533,0.614606,5.15045,4.771,2.482,22.2476,0.534743,64.0387,0.0,1.0,1.0,0,0,0,0,-38.006542,91.693475,-58.217627,87.733245,-11.865965,53.806883,0.909528,-18.816458,-1.051308,-3.743664,-0.873642,-29.398564,-18.15145,-5.04837,-17.277809,54.58217,-32.303375,6.950493,-2.46789,41.095253,-13.486917,-1.458527,7.316738,-19.867766,-6.951712,-2.834136,24.852925,-2.968377,-0.090388,6.114955,-2.169479,-12.435609,-0.63083,1.312649,3.015982,0.895981,0.720189,5.839107,1.5406


In [49]:
temp_test = test_metadata_kaggle.copy()

In [50]:
del temp_test['object_id']

In [51]:
temp_test.fillna(0,inplace = True)

In [52]:
print(temp.shape,temp_test.shape)

(7848, 137) (3492890, 137)


In [53]:
list(temp.columns) == list(temp_test.columns)

True

In [54]:
%%time
test_pred0 = pd.DataFrame()
test_pred1 = pd.DataFrame()
test_pred2 = pd.DataFrame()
test_pred3 = pd.DataFrame()
test_pred4 = pd.DataFrame()

list_of_df = [test_pred0,test_pred1,test_pred2,test_pred3,test_pred4]

for num,c in enumerate(clfs):
    print(num)
    for k in range(0,len(temp_test),500000):
        test_pred = pd.DataFrame(c.predict_proba(temp_test[ k:k+500000] ))
        list_of_df[num] = pd.concat([list_of_df[num],test_pred],axis=0)
        del test_pred

0
1
2
3
4
CPU times: user 3h 24min 12s, sys: 43.7 s, total: 3h 24min 55s
Wall time: 30min 56s


In [55]:
test_pred2 = pd.DataFrame()
test_pred2 = (list_of_df[0] + list_of_df[1] + list_of_df[2] + list_of_df[3] + list_of_df[4])/5

In [56]:
print(test_pred2.shape)

(3492890, 14)


In [57]:
#test_pred2 = pd.DataFrame(np.random.rand(10,14))

In [58]:
test_pred2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.000159,0.001374,0.000114,0.776456,0.065814,0.000287,0.087279,3.3e-05,0.000113,0.003116,0.000222,0.063397,0.00021,0.001427
1,9.5e-05,0.008413,9.7e-05,0.153913,0.036415,0.000203,0.035604,0.000157,0.000418,0.021963,0.001872,0.727573,0.000188,0.01309
2,0.000197,0.016769,0.000211,0.143189,0.098407,0.000419,0.036879,0.006017,0.00054,0.069276,0.005418,0.579812,0.000256,0.04261
3,0.000228,0.002427,0.000429,0.048005,0.025962,0.000429,0.094537,0.005604,0.000403,0.578677,0.000932,0.180624,0.000294,0.061447
4,5.8e-05,0.001247,6.1e-05,0.056203,0.260264,0.000113,0.009146,2.6e-05,8.2e-05,0.005087,7.4e-05,0.667468,7.2e-05,9.8e-05


In [59]:
temp_columns = ['object_id','class_6','class_15','class_16','class_42','class_52','class_53','class_62','class_64','class_65','class_67','class_88','class_90','class_92','class_95','class_99']

In [60]:
test_pred2.columns = temp_columns[1:15]

In [61]:
def getUnknown(data):
    return ((((((data["mymedian"]) + (((data["mymean"]) / 2.0)))/2.0)) + (((((1.0) - (((data["mymax"]) * (((data["mymax"]) * (data["mymax"]))))))) / 2.0)))/2.0)

feats = ['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53',
         'class_62', 'class_64', 'class_65', 'class_67', 'class_88', 'class_90',
         'class_92', 'class_95']

In [62]:
klm = pd.DataFrame()
klm['mymean'] = test_pred2[feats].mean(axis=1)
klm['mymedian'] = test_pred2[feats].median(axis=1)
klm['mymax'] = test_pred2[feats].max(axis=1)

In [63]:
test_pred2['class_99'] = getUnknown(klm)

In [64]:
test_pred2.tail()

Unnamed: 0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
492885,0.000216,0.387601,0.000385,0.187291,0.117463,0.000445,0.129291,0.000314,0.00147,0.018458,0.002131,0.153796,0.000287,0.000851,0.244821
492886,0.00022,0.008853,0.00019,0.040771,0.208497,0.000268,0.015844,0.429028,0.000393,0.022308,0.000971,0.265852,0.000232,0.006573,0.241115
492887,0.000755,0.02604,0.000174,0.89503,0.034262,0.000385,0.030965,6.9e-05,0.000187,0.001288,0.000457,0.007972,0.000244,0.002171,0.079937
492888,6.7e-05,0.691828,9.4e-05,0.145591,0.004017,0.000128,0.012245,0.115592,0.000447,0.000294,0.00021,0.028841,9.9e-05,0.000548,0.176271
492889,0.000262,0.096712,0.000311,0.468399,0.181191,0.000568,0.06448,0.000126,0.000655,0.02804,0.001414,0.156185,0.000732,0.000925,0.23353


In [65]:
test_pred2 = test_pred2.reset_index(drop=True)

In [66]:
print(test_pred2.shape,test_id.shape)

(3492890, 15) (3492890,)


In [67]:
test_id.tail()

3492885    130787966
3492886    130787971
3492887    130787974
3492888    130788053
3492889    130788054
Name: object_id, dtype: int64

In [68]:
test_id = test_id.reset_index(drop=True)

In [69]:
test_id.index == test_pred2.index

array([ True,  True,  True, ...,  True,  True,  True])

In [70]:
%%time
test_pred = pd.concat([test_id,test_pred2],axis=1)

CPU times: user 89.6 ms, sys: 142 ms, total: 231 ms
Wall time: 230 ms


In [71]:
test_pred = test_pred[temp_columns]

In [72]:
test_pred.head()

Unnamed: 0,object_id,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
0,13,0.000159,0.001374,0.000114,0.776456,0.065814,0.000287,0.087279,3.3e-05,0.000113,0.003116,0.000222,0.063397,0.00021,0.001427,0.142108
1,14,9.5e-05,0.008413,9.7e-05,0.153913,0.036415,0.000203,0.035604,0.000157,0.000418,0.021963,0.001872,0.727573,0.000188,0.01309,0.163927
2,17,0.000197,0.016769,0.000211,0.143189,0.098407,0.000419,0.036879,0.006017,0.00054,0.069276,0.005418,0.579812,0.000256,0.04261,0.213046
3,23,0.000228,0.002427,0.000429,0.048005,0.025962,0.000429,0.094537,0.005604,0.000403,0.578677,0.000932,0.180624,0.000294,0.061447,0.211487
4,34,5.8e-05,0.001247,6.1e-05,0.056203,0.260264,0.000113,0.009146,2.6e-05,8.2e-05,0.005087,7.4e-05,0.667468,7.2e-05,9.8e-05,0.184613


In [73]:
print(test_pred.shape)

(3492890, 16)


In [74]:
%%time
test_pred.to_csv('test_pred_37.csv',index=False)

CPU times: user 1min 40s, sys: 1.36 s, total: 1min 42s
Wall time: 1min 42s


In [None]:
#!kaggle competitions submit -c PLAsTiCC-2018 -f test_pred_27.csv -m "Message"