# Libraries
---

In [1]:
import pandas as pd
import numpy as np
import random
import json
import os

from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy.optimize import minimize

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

import warnings
warnings.simplefilter('ignore')

In [2]:
# Central run configuration read by the cells below.
CFG = {
    'target': 'target',             # name of the label column in the training frame
    'n_class': 9,                   # number of classes (OOF/prediction width, XGBoost num_class)
    'n_clusters': 9,                # number of KMeans clusters used for the feature_km_* columns
    'n_components': 2,              # number of PCA components used for the feature_pca_* columns
    'seed': 2021,                   # base random seed (global seeding, PCA, KMeans, first CV seed)
    'n_estimators': 20000,          # upper bound on boosting rounds for all three models
    'early_stopping_rounds': 500,   # early-stopping patience passed to each model
    'n_splits': 5,                  # number of StratifiedKFold folds
    'verbose': False,               # verbosity flag forwarded to the fit/train calls
}

In [3]:
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy's global RNG and the PYTHONHASHSEED variable.

    Args:
        seed: integer seed applied to all three sources (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the configured base seed.
seed_everything(CFG['seed'])

In [4]:
# Function for variable description
def description(df):
    """Summarise every column of `df` as one row of a new DataFrame.

    Args:
        df: the DataFrame to describe.

    Returns:
        A DataFrame with one row per column of `df` and the columns
        Name, dtypes, Missing, Uniques, Mean, Std, Minimum, Maximum,
        First Value, Second Value, Third Value and dimension.

    Notes:
        * Mean/Std/Minimum/Maximum ignore NaNs (np.nan* reductions, so Std is
          the population standard deviation) and are NaN for non-numeric
          columns instead of raising.
        * First/Second/Third Value are NaN when `df` has fewer rows than that.
    """
    n_rows, n_cols = df.shape

    # Build the frame directly from the column labels; this does not depend on
    # the name of the columns axis the way reset_index()/['index'] does.
    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Only numeric columns take part in the reductions.
    is_numeric = np.array(
        [pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes], dtype=bool
    )
    can_reduce = bool(n_rows) and bool(is_numeric.any())
    if can_reduce:
        numeric_block = df.iloc[:, is_numeric].to_numpy(dtype=float, na_value=np.nan)

    def _column_stat(reducer):
        # One value per column of df; NaN wherever the reduction does not apply.
        stat = np.full(n_cols, np.nan)
        if can_reduce:
            stat[is_numeric] = reducer(numeric_block, axis=0)
        return stat

    summary['Mean'] = _column_stat(np.nanmean)
    summary['Std'] = _column_stat(np.nanstd)
    summary['Minimum'] = _column_stat(np.nanmin)
    summary['Maximum'] = _column_stat(np.nanmax)

    # Sample rows are positional; guard against frames shorter than three rows.
    for label, position in (('First Value', 0), ('Second Value', 1), ('Third Value', 2)):
        summary[label] = df.iloc[position].values if position < n_rows else np.nan

    summary['dimension'] = str(df.shape)
    return summary

# Loading data
---

In [5]:
# Read the three input CSV files from the working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Every column whose name contains 'feature_' is a model input.
features = [name for name in train.columns if 'feature_' in name]

# Label strings are reduced to the integer after their last underscore, minus one.
target = train[CFG['target']].apply(lambda label: int(label.split("_")[-1]) - 1)

# Apply log(1 + x) to the feature columns of both frames.
for frame in (train, test):
    frame[features] = np.log1p(frame[features])

# Feature Engineering
---

In [6]:
# Stack train on top of test so engineered features are computed on both at once.
# ignore_index=True gives the stacked frame a unique 0..N-1 index; without it the
# train and test labels repeat, which makes any later label-aligned (Series-valued)
# column assignment fragile. Rows stay in the same order, so positional slicing
# by train.shape[0] still separates the two parts.
all_df = pd.concat([train, test], ignore_index=True)

In [7]:
def feature_engineering(df, features):
    """Add row-wise summary statistics of `features` to `df` (in place).

    Args:
        df: DataFrame containing the columns listed in `features`; it is
            modified in place and also returned.
        features: list of column names to aggregate over for every row.

    Returns:
        (df, fe_feats) where fe_feats is the list of the eight column names
        that were added.
    """
    block = df[features]

    # Results are assigned as plain arrays so they are written positionally and
    # do not depend on the uniqueness of df's index labels.
    df['feature_fe_max'] = block.max(axis=1).to_numpy()
    df['feature_fe_mean'] = block.mean(axis=1).to_numpy()
    df['feature_fe_median'] = np.median(block, axis=1)
    df['feature_fe_std'] = block.std(axis=1, ddof=0).to_numpy()  # population std
    df['feature_fe_sum'] = block.sum(axis=1).to_numpy()

    num_nonzero = np.count_nonzero(block, axis=1)
    df['feature_fe_num_nonzero'] = num_nonzero
    # Use the count directly rather than rebuilding its column name from
    # features[0], which only resolved correctly when that name was 'feature_0'.
    df['feature_fe_ratio_nonzero'] = num_nonzero / len(features)
    df['feature_fe_num_unique'] = block.nunique(axis=1).to_numpy()

    fe_feats = [
        'feature_fe_max', 'feature_fe_mean', 'feature_fe_median', 'feature_fe_std', 'feature_fe_sum',
        'feature_fe_num_nonzero', 'feature_fe_ratio_nonzero', 'feature_fe_num_unique'
    ]

    return df, fe_feats

# Add the row-wise statistics to the stacked frame and keep the new column names.
all_df, fe_feats = feature_engineering(all_df, features)

In [8]:
# Fit PCA on the current feature columns of the stacked frame and store the
# resulting components as feature_pca_<i> columns.
n_pca = CFG['n_components']
pca_feats = [f'feature_pca_{idx}' for idx in range(n_pca)]
pca = PCA(n_components=n_pca, random_state=CFG['seed'])
pca_scores = pca.fit_transform(all_df[features])
all_df[pca_feats] = pca_scores

In [9]:
# Grow the feature list in place with the engineered and PCA column names.
# Re-running this cell appends the same names again.
features.extend(fe_feats + pca_feats)

In [10]:
# Fit KMeans on the current feature columns (raw + engineered + PCA) and store
# its fit_transform output as feature_km_<i> columns, one per cluster.
n_km = CFG['n_clusters']
km_features = [f'feature_km_{idx}' for idx in range(n_km)]
km = KMeans(n_clusters=n_km, random_state=CFG['seed'])

km_output = km.fit_transform(all_df[features])
all_df[km_features] = km_output
features.extend(km_features)

In [11]:
# Split the stacked frame back by position: the first len(train) rows are the
# training part, the remainder is the test part.
n_train_rows = train.shape[0]
model_inputs = all_df[features]
train_df = model_inputs.iloc[:n_train_rows].copy()
test_df = model_inputs.iloc[n_train_rows:].copy()

## Quick look

In [12]:
# Transposed summary: one column per feature, one row per statistic.
description(train_df).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93
Name,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_fe_max,feature_fe_mean,feature_fe_median,feature_fe_std,feature_fe_sum,feature_fe_num_nonzero,feature_fe_ratio_nonzero,feature_fe_num_unique,feature_pca_0,feature_pca_1,feature_km_0,feature_km_1,feature_km_2,feature_km_3,feature_km_4,feature_km_5,feature_km_6,feature_km_7,feature_km_8
dtypes,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
Missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Uniques,42,37,48,59,30,55,40,26,28,51,32,37,28,40,30,105,27,15,21,66,28,31,55,21,50,40,20,49,78,58,18,36,77,41,31,39,28,26,39,47,35,56,42,53,26,27,87,63,39,23,40,49,25,34,72,65,35,28,31,92,99,70,61,70,22,41,23,57,42,52,49,31,37,91,50,142,118755,7,178728,111355,69,69,33,199776,199776,199776,199776,199776,199776,199776,199776,199776,199776,199776
Mean,0.317559,0.366212,0.551127,0.54098,0.271172,0.458862,0.298882,0.269256,0.600165,0.471293,0.527479,0.285994,0.897459,0.364591,0.563902,0.253952,0.503054,0.154739,0.678351,1.27822,0.60179,0.372478,0.300707,0.253896,0.270546,0.663873,0.367106,0.291825,0.509809,0.463181,0.306087,0.65328,0.289929,0.564505,0.298779,0.428395,0.232034,0.590658,0.345925,0.804877,0.416793,0.322198,0.285961,1.12743,0.388657,0.286826,0.422625,0.197524,0.461545,0.174116,0.733321,0.513819,0.287228,0.5455,1.25578,0.443628,0.72415,0.240476,0.374184,0.381471,0.459346,0.371727,0.612369,0.298289,0.267639,0.515214,0.20084,0.405996,0.30543,0.524091,0.380944,0.32456,0.445213,0.468588,0.172063,2.89769,0.447979,0.156733,0.660315,33.5985,26.5197,0.353596,8.77387,0.00240212,0.000733286,20.9885,33.8353,47.6686,68.2185,26.3553,22.1753,27.4753,35.4645,22.7223
Std,0.625406,0.683273,0.875113,0.861528,0.590506,0.695827,0.641819,0.546334,0.855059,0.771456,0.715704,0.635124,0.795766,0.648574,0.698558,0.669815,0.733975,0.400664,0.722319,1.09595,0.68247,0.658473,0.746814,0.513093,0.609644,0.862702,0.555285,0.549041,0.846828,0.739911,0.4929,0.780216,0.700613,0.718244,0.522443,0.660142,0.548994,0.718895,0.730636,0.810073,0.683387,0.649796,0.579152,1.00791,0.596471,0.548492,0.871088,0.581668,0.756581,0.488959,0.81761,0.734007,0.508851,0.686603,1.10432,0.905819,0.816447,0.502035,0.60074,0.687644,0.759477,0.669004,0.939326,0.696841,0.566766,0.811808,0.497322,0.797113,0.599755,0.737974,0.662353,0.581819,0.682153,0.946553,0.499659,0.903547,0.264802,0.303271,0.220087,19.8601,12.5478,0.167304,3.7613,2.45192,1.19167,13.4212,23.0996,20.5623,22.4913,20.3307,16.5688,14.5545,17.6862,12.6108
Minimum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-4.06963,-4.86682,4.29705,2.00119,7.19479,8.54988,3.07527,3.80111,5.58548,6.26074,4.89376
Maximum,4.12713,3.95124,4.17439,4.26268,3.66356,4.34381,3.78419,3.43399,3.66356,4.29046,3.52636,3.85015,3.63759,3.78419,3.49651,4.80402,3.3322,2.70805,3.13549,5.57595,3.43399,3.52636,4.82028,3.13549,4.2485,5.01064,3.21888,4.44265,4.66344,4.44265,3.13549,3.68888,4.36945,3.73767,3.61092,3.73767,3.7612,3.55535,3.73767,3.91202,4.40672,4.30407,3.98898,4.15888,3.3322,3.43399,4.77068,4.58497,3.71357,3.66356,4.04305,4.30407,3.66356,3.61092,4.65396,4.34381,3.85015,3.46574,3.43399,5.86647,5.44674,4.39445,4.63473,4.39445,3.2581,4.00733,3.21888,4.38203,4.02535,4.18965,4.21951,3.43399,4.12713,4.8752,3.97029,5.86647,1.94888,1.94591,1.51008,146.166,68,0.906667,33,12.2637,4.93492,124.496,153.026,89.2856,110.605,143.204,133.861,104.147,92.2704,114.732
First Value,0,0,1.94591,0.693147,0,0,0,0,2.07944,0,0,0,1.38629,0,0.693147,0,0,1.38629,1.38629,0.693147,0,1.09861,0,0,0,0,0,0,0.693147,0,0,0.693147,0,0,0,0,0,2.48491,0,0,0,0,0,2.30259,0,0,0,0,0,0,1.38629,0,0.693147,1.38629,0,0,0,0,0,0,0,0.693147,0.693147,0,0,1.38629,0,0,0,0,0,0,1.09861,0,0,2.48491,0.33164,0,0.623183,24.873,20,0.266667,8,-0.588711,-3.46788,10.4883,22.649,56.817,78.3978,13.0077,6.41689,30.3497,42.302,19.7164
Second Value,0,0,0,0,0,0,0,0,0,0,0,0,0.693147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.693147,0,0,0,0,0,0,0.693147,0,0,0,0,0,0,0,1.09861,0,0,0.693147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.09861,0,0,0,0,0,0,0.693147,0,1.09861,0.0755061,0,0.241897,5.66296,7,0.0933333,3,-3.2739,-1.13039,32.6228,3.7247,80.4735,101.977,13.0388,22.7843,53.7985,65.9097,42.8357


In [13]:
# Same transposed summary for the test features, for comparison with train.
description(test_df).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93
Name,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_fe_max,feature_fe_mean,feature_fe_median,feature_fe_std,feature_fe_sum,feature_fe_num_nonzero,feature_fe_ratio_nonzero,feature_fe_num_unique,feature_pca_0,feature_pca_1,feature_km_0,feature_km_1,feature_km_2,feature_km_3,feature_km_4,feature_km_5,feature_km_6,feature_km_7,feature_km_8
dtypes,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
Missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Uniques,42,37,48,59,30,55,40,26,28,51,32,37,28,40,30,103,27,15,21,66,28,31,55,21,50,40,20,49,75,58,18,36,77,41,31,39,28,26,39,47,35,56,42,53,26,27,85,63,39,23,40,49,25,34,72,65,35,28,31,83,86,70,61,70,22,41,23,57,42,52,49,31,37,89,50,138,64935,7,91767,61699,67,67,33,99921,99921,99921,99921,99921,99921,99921,99921,99921,99921,99921
Mean,0.318766,0.364863,0.543809,0.544877,0.270039,0.460343,0.300553,0.26991,0.600826,0.471696,0.524353,0.289563,0.894842,0.360239,0.565063,0.248933,0.502577,0.154694,0.678938,1.28035,0.601662,0.371766,0.295843,0.254714,0.271338,0.666344,0.368151,0.291831,0.504734,0.457507,0.306077,0.653727,0.292233,0.567243,0.303132,0.429244,0.23346,0.594587,0.344692,0.798858,0.418569,0.319566,0.28461,1.12869,0.387712,0.284449,0.418429,0.197188,0.457519,0.170656,0.732308,0.511813,0.284439,0.542263,1.25103,0.441449,0.726404,0.24014,0.372675,0.378921,0.46051,0.370152,0.610145,0.296577,0.268924,0.514061,0.20042,0.404415,0.306384,0.524865,0.375748,0.322597,0.446297,0.466677,0.172742,2.89494,0.447223,0.155889,0.659778,33.5417,26.4828,0.353104,8.76501,-0.00480424,-0.00146657,20.9879,33.767,47.7098,68.2722,26.2895,22.1285,27.5315,35.5142,22.7633
Std,0.629787,0.682605,0.870421,0.864224,0.59021,0.697455,0.643313,0.54889,0.85512,0.770714,0.708382,0.641326,0.794139,0.645096,0.69865,0.660218,0.733121,0.400008,0.722022,1.10009,0.684058,0.657964,0.738123,0.514435,0.610991,0.865263,0.556252,0.549357,0.838005,0.737168,0.491775,0.78098,0.702127,0.722333,0.526603,0.661425,0.551275,0.722556,0.730853,0.806372,0.685669,0.645762,0.578889,1.01149,0.597172,0.543444,0.869048,0.582325,0.752223,0.482726,0.815336,0.735237,0.505751,0.681758,1.10766,0.902125,0.817344,0.500906,0.598915,0.686889,0.755885,0.668704,0.937021,0.697051,0.568711,0.811626,0.49794,0.791899,0.600366,0.739891,0.658792,0.579455,0.687486,0.945672,0.502244,0.904457,0.264513,0.302239,0.220345,19.8385,12.5464,0.167285,3.75792,2.45043,1.1937,13.3728,23.0812,20.596,22.5114,20.3164,16.5504,14.504,17.6799,12.5405
Minimum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-4.06963,-4.27372,4.46719,2.10592,7.317,8.06226,3.21239,3.64882,5.38531,6.25919,4.98522
Maximum,4.12713,3.95124,4.17439,4.26268,3.66356,4.34381,3.78419,3.43399,3.66356,4.29046,3.52636,3.85015,3.63759,3.78419,3.49651,4.80402,3.3322,2.70805,3.13549,5.57595,3.43399,3.52636,4.82028,3.13549,4.2485,5.01064,3.21888,4.44265,4.66344,4.44265,3.13549,3.68888,4.36945,3.73767,3.61092,3.73767,3.7612,3.55535,3.73767,3.91202,4.40672,4.30407,3.98898,4.15888,3.3322,3.43399,4.77068,4.58497,3.71357,3.66356,4.04305,4.30407,3.66356,3.61092,4.65396,4.34381,3.85015,3.46574,3.43399,5.86647,5.44674,4.39445,4.63473,4.39445,3.2581,4.00733,3.21888,4.38203,4.02535,4.18965,4.21951,3.43399,4.12713,4.85203,3.97029,5.86647,1.89747,1.94591,1.45598,142.31,66,0.88,33,12.957,4.81426,121.903,150.672,89.2856,110.605,140.768,131.347,101.381,89.408,112.055
First Value,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.09861,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.38629,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.38629,0.0331321,0,0.201542,2.48491,2,0.0266667,3,-3.78431,0.137565,38.1495,8.09196,85.7609,107.093,18.409,28.2857,59.2661,71.3135,48.3441
Second Value,0.693147,1.09861,0,0,0,0,0,0,0,0,0.693147,0,1.09861,1.09861,0,0,0.693147,0,0,0,1.09861,0,0,0.693147,0,1.38629,0,0.693147,0,0,1.09861,0,0,0.693147,0,0,0,0,0,0,0.693147,0,0,0,0,0.693147,0,0,0,0,0.693147,0,0.693147,1.38629,1.09861,0,0,0,0,0,0,0.693147,2.07944,0,0.693147,1.38629,0.693147,1.38629,0,0,0,0,1.38629,0,0,2.07944,0.32818,0,0.507846,24.6135,25,0.333333,5,-1.22793,0.0832205,8.60841,24.729,55.2359,77.1469,14.8045,6.88199,28.3399,40.4522,17.6366


# Modeling: LightGBM/XGBoost/CatBoost
---

In [14]:
# Keyword arguments for lgb.LGBMClassifier; 'random_state' is overwritten with
# the current seed inside the training loop.
lgb_params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'n_estimators': CFG['n_estimators'],
    'random_state': CFG['seed'],
    'learning_rate': 3e-2,
    'max_depth': 73,
    'num_leaves': 42,
    'subsample': 0.84327,
    'colsample_bytree': 0.234,
    'reg_alpha': 16.724382543126165,
    'reg_lambda': 4.4252351797809535,
    'min_child_samples': 47,
    'min_child_weight': 0.0004586402479388673,
    'importance_type': 'gain',
}

# Parameters for xgb.train; 'seed' is overwritten with the current seed inside
# the training loop. 'tree_method' requests GPU histogram training.
xgb_params = {
    "objective": 'multi:softprob',
    'eval_metric': 'mlogloss',
    "max_depth": 6,
    "learning_rate": 5e-3,
    "colsample_bytree": 0.4,
    "subsample": 0.6,
    "reg_alpha": 6,
    "min_child_weight": 100,
    "n_jobs": -1,
    'num_class': CFG['n_class'],
    "seed": CFG['seed'],
    'tree_method': "gpu_hist",
}

# Keyword arguments for cbt.CatBoostClassifier; 'task_type' requests GPU training.
cbt_params = {
    'bootstrap_type': 'Poisson',
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'random_seed': CFG['seed'],
    'task_type': 'GPU',
    'max_depth': 8,
    'learning_rate': 1e-2,
    'n_estimators': CFG['n_estimators'],
    'max_bin': 280,
    'min_data_in_leaf': 64,
    'l2_leaf_reg': 0.01,
    'subsample': 0.8,
}

In [None]:
# Train LightGBM, XGBoost and CatBoost with stratified K-fold cross-validation,
# repeated once per seed in `seed_l`.
#
# Produces, per model: out-of-fold probabilities and fold-averaged test
# probabilities for every seed (`*_seed_oof`, `*_seed_pred`), the per-fold
# evaluation curves (`*_history`), per-fold feature importances
# (`*_feature_importances`) and a table of log-loss scores written to score.csv.
#
# NOTE(review): the `verbose`/`early_stopping_rounds` fit arguments of
# LGBMClassifier and XGBoost's `ntree_limit`/`best_ntree_limit` are
# version-dependent APIs - confirm against the pinned library versions.
seed_l = [CFG['seed'], 1963]


def _importance_frame(names, values, fold, seed):
    """Return one fold's importances as a frame: feature, importance, fold, seed."""
    return pd.DataFrame({
        "feature": list(names),
        "importance": list(values),
        "fold": fold,
        "seed": seed,
    })


# Per-fold importance frames are collected in lists and concatenated once after
# the loop (DataFrame.append is deprecated and absent from pandas 2).
lgb_fi_parts, xgb_fi_parts, cbt_fi_parts = [], [], []
lgb_feature_importances = pd.DataFrame()
xgb_feature_importances = pd.DataFrame()
cbt_feature_importances = pd.DataFrame()

lgb_history, xgb_history, cbt_history = [], [], []
lgb_seed_oof, xgb_seed_oof, cbt_seed_oof = [], [], []
lgb_seed_pred, xgb_seed_pred, cbt_seed_pred = [], [], []

score_list = []

for seed in seed_l:
    print(f"===== SEED {seed} =====")
    kf = StratifiedKFold(n_splits=CFG['n_splits'], shuffle=True, random_state=seed)
    lgb_oof = np.zeros((train_df.shape[0], CFG['n_class']))
    xgb_oof = np.zeros((train_df.shape[0], CFG['n_class']))
    cbt_oof = np.zeros((train_df.shape[0], CFG['n_class']))
    lgb_pred, xgb_pred, cbt_pred = 0, 0, 0

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_df, y=target)):
        X_train, y_train = train_df.iloc[trn_idx], target.iloc[trn_idx]
        X_valid, y_valid = train_df.iloc[val_idx], target.iloc[val_idx]
        X_test = test_df

        # LightGBM
        lgb_params['random_state'] = seed
        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
            eval_metric='multi_logloss',
            verbose=CFG['verbose'],
            early_stopping_rounds=CFG['early_stopping_rounds'],
            )

        lgb_fi_parts.append(
            _importance_frame(clf.feature_name_, clf.feature_importances_, fold, seed)
        )

        lgb_oof[val_idx] = clf.predict_proba(X_valid)
        lgb_pred += clf.predict_proba(X_test) / CFG['n_splits']
        m_logloss = log_loss(y_valid, lgb_oof[val_idx])
        print(f"fold {fold} lgb multi_logloss: {m_logloss}")

        lgb_history.append(clf.evals_result_)
        score_list.append(['lgb', seed, fold, m_logloss])

        # XGBoost
        trn_data = xgb.DMatrix(data=X_train, label=y_train)
        val_data = xgb.DMatrix(data=X_valid, label=y_valid)

        # A fresh dict per fold: xgb.train fills the dict it is given in place,
        # so sharing one dict across folds would make every xgb_history entry
        # point at the same (last) fold's curves.
        eval_fold_result = {}

        xgb_params['seed'] = seed
        model = xgb.train(
            params=xgb_params,
            dtrain=trn_data,
            evals=[(trn_data, "train"), (val_data, "valid")],
            evals_result=eval_fold_result,
            num_boost_round = CFG['n_estimators'],
            verbose_eval=CFG['verbose'],
            early_stopping_rounds=CFG['early_stopping_rounds'],
            )

        gain_scores = model.get_score(importance_type="gain")
        xgb_fi_parts.append(
            _importance_frame(gain_scores.keys(), gain_scores.values(), fold, seed)
        )

        xgb_oof[val_idx] = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
        xgb_pred += model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit) / CFG['n_splits']
        m_logloss = log_loss(y_valid, xgb_oof[val_idx])
        print(f"fold {fold} xgb multi_logloss: {m_logloss}")

        xgb_history.append(eval_fold_result)
        score_list.append(['xgb', seed, fold, m_logloss])

        # CatBoostClassifier
        # Re-seed per outer seed, consistent with LightGBM and XGBoost above;
        # otherwise both seed runs would train CatBoost with the same seed.
        cbt_params['random_seed'] = seed
        clf = cbt.CatBoostClassifier(**cbt_params)
        clf.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            use_best_model=True,
            early_stopping_rounds=CFG['early_stopping_rounds'],
            verbose=CFG['verbose']
            )

        cbt_fi_parts.append(
            _importance_frame(clf.feature_names_, clf.feature_importances_, fold, seed)
        )

        cbt_oof[val_idx] = clf.predict_proba(X_valid)
        cbt_pred += clf.predict_proba(X_test) / CFG['n_splits']
        m_logloss = log_loss(y_valid, cbt_oof[val_idx])
        print(f"fold {fold} cbt multi_logloss: {m_logloss}")

        cbt_history.append(clf.get_evals_result())
        score_list.append(['cbt', seed, fold, m_logloss])

    # Out-of-fold log-loss of this seed over the whole training set.
    lgb_logloss = log_loss(target, lgb_oof)
    score_list.append(['lgb', 'avg', 'oof', lgb_logloss])

    xgb_logloss = log_loss(target, xgb_oof)
    score_list.append(['xgb', 'avg', 'oof', xgb_logloss])

    cbt_logloss = log_loss(target, cbt_oof)
    score_list.append(['cbt', 'avg', 'oof', cbt_logloss])

    print("-" * 60)
    print(f"lgb multi_logloss: {lgb_logloss}")
    print(f"xgb multi_logloss: {xgb_logloss}")
    print(f"cbt multi_logloss: {cbt_logloss}\n")

    lgb_seed_oof.append(lgb_oof)
    xgb_seed_oof.append(xgb_oof)
    cbt_seed_oof.append(cbt_oof)
    lgb_seed_pred.append(lgb_pred)
    xgb_seed_pred.append(xgb_pred)
    cbt_seed_pred.append(cbt_pred)

if lgb_fi_parts:
    lgb_feature_importances = pd.concat(lgb_fi_parts)
if xgb_fi_parts:
    xgb_feature_importances = pd.concat(xgb_fi_parts)
if cbt_fi_parts:
    cbt_feature_importances = pd.concat(cbt_fi_parts)

score_df = pd.DataFrame(score_list, columns=['model', 'seed', 'fold', 'logloss_score'])
score_df.to_csv("score.csv", index=False)

===== SEED 2021 =====
fold 0 lgb multi_logloss: 1.7435952910417363
fold 0 xgb multi_logloss: 1.7430443747438522


## History

In [None]:
# Save the raw LightGBM evaluation curves, then plot the validation loss of the
# first CFG['n_splits'] recorded folds.
with open("lgb_history.json", 'w') as json_file:
    json.dump(lgb_history, json_file)

fig = plt.figure(figsize=(16, 4))
for fold_no in range(CFG['n_splits']):
    valid_curve = lgb_history[fold_no]['valid_0']['multi_logloss']
    rounds = list(range(len(valid_curve)))
    plt.plot(rounds, valid_curve, label=f'valid_{fold_no}')

plt.legend()
plt.title("LGBMClassifier validation history")
plt.tight_layout()

fig.savefig("lgb_history.png")

In [None]:
# Save the raw XGBoost evaluation curves, then plot train and validation loss of
# the first CFG['n_splits'] recorded folds.
with open("xgb_history.json", 'w') as json_file:
    json.dump(xgb_history, json_file)

fig = plt.figure(figsize=(16, 4))
for fold_no in range(CFG['n_splits']):
    fold_curves = xgb_history[fold_no]
    train_curve = fold_curves['train']['mlogloss']
    valid_curve = fold_curves['valid']['mlogloss']
    rounds = list(range(len(train_curve)))  # x axis is sized from the train curve
    plt.plot(rounds, train_curve, label=f'train_{fold_no}')
    plt.plot(rounds, valid_curve, label=f'valid_{fold_no}')

plt.legend()
plt.title("XGBClassifier validation history")
plt.tight_layout()

fig.savefig("xgb_history.png")

In [None]:
# Save the raw CatBoost evaluation curves, then plot learn and validation loss
# of the first CFG['n_splits'] recorded folds.
with open("cbt_history.json", 'w') as json_file:
    json.dump(cbt_history, json_file)

fig = plt.figure(figsize=(16, 4))
for fold_no in range(CFG['n_splits']):
    fold_curves = cbt_history[fold_no]
    learn_curve = fold_curves['learn']['MultiClass']
    valid_curve = fold_curves['validation']['MultiClass']
    rounds = list(range(len(valid_curve)))  # x axis is sized from the validation curve
    plt.plot(rounds, learn_curve, label=f'train_{fold_no}')
    plt.plot(rounds, valid_curve, label=f'valid_{fold_no}')

plt.legend()
plt.title("CatBoostClassifier validation history")
plt.tight_layout()

fig.savefig("cbt_history.png")

## Feature importances

In [None]:
# Bar plot of LightGBM feature importances over all folds and seeds, with
# features ordered by descending mean importance.
fig = plt.figure(figsize=(10, 16), tight_layout=True)

mean_by_feature = lgb_feature_importances.groupby("feature").mean()
ranked = mean_by_feature.sort_values("importance", ascending=False)
order = list(ranked.index)
sns.barplot(x="importance", y="feature", data=lgb_feature_importances, order=order)
plt.title("LGBMClassifier feature impotrances")

fig.savefig("lgb_feature_importance.png")

In [None]:
# Bar plot of XGBoost feature importances over all folds and seeds, with
# features ordered by descending mean importance.
fig = plt.figure(figsize=(10, 16), tight_layout=True)

mean_by_feature = xgb_feature_importances.groupby("feature").mean()
ranked = mean_by_feature.sort_values("importance", ascending=False)
order = list(ranked.index)
sns.barplot(x="importance", y="feature", data=xgb_feature_importances, order=order)
plt.title("XGBClassifier feature impotrances")

fig.savefig("xgb_feature_importance.png")

In [None]:
# Bar plot of CatBoost feature importances over all folds and seeds, with
# features ordered by descending mean importance.
fig = plt.figure(figsize=(10, 16), tight_layout=True)

order = list(cbt_feature_importances.groupby("feature").mean().sort_values("importance", ascending=False).index)
sns.barplot(x="importance", y="feature", data=cbt_feature_importances, order=order)
# This figure shows the CatBoost importances; the title previously named XGBClassifier.
plt.title("CatBoostClassifier feature importances")

fig.savefig("cbt_feature_importance.png")

# Post-process
---

## Seed averaging and ensemble

In [None]:
# Average each model's out-of-fold and test probabilities over the seed runs.
lgb_mean_seed_oof = np.stack(lgb_seed_oof).mean(axis=0)
xgb_mean_seed_oof = np.stack(xgb_seed_oof).mean(axis=0)
cbt_mean_seed_oof = np.stack(cbt_seed_oof).mean(axis=0)

lgb_mean_seed_pred = np.stack(lgb_seed_pred).mean(axis=0)
xgb_mean_seed_pred = np.stack(xgb_seed_pred).mean(axis=0)
cbt_mean_seed_pred = np.stack(cbt_seed_pred).mean(axis=0)

In [None]:
def objective_function(X, a0, a1, a2):
    """Log-loss of a three-way blend of out-of-fold predictions.

    Args:
        X: the two free blend weights; X[0] scales a0, X[1] scales a1 and the
            remainder 1 - X[0] - X[1] scales a2.
        a0, a1, a2: out-of-fold prediction arrays to blend.

    Returns:
        log_loss of the blended predictions against the global `target`.
    """
    first_weight, second_weight = X[0], X[1]
    blended = first_weight*a0 + second_weight*a1 + (1-first_weight-second_weight)*a2
    return log_loss(target, blended)

In [None]:
# Recompute the integer class labels, then search the two free blend weights
# with Nelder-Mead, starting from 0.5 / 0.5.
target = train['target'].apply(lambda label: int(label.split("_")[-1]) - 1)

seed_averaged_oofs = (lgb_mean_seed_oof, xgb_mean_seed_oof, cbt_mean_seed_oof)
res = minimize(
    objective_function,
    [0.5, 0.5],
    args=seed_averaged_oofs,
    method='Nelder-Mead',
)

res

In [None]:
# Apply the optimised blend weights to both the out-of-fold and test predictions.
w_lgb, w_xgb = res.x[0], res.x[1]
w_cbt = 1 - w_lgb - w_xgb

oof = w_lgb*lgb_mean_seed_oof + w_xgb*xgb_mean_seed_oof + w_cbt*cbt_mean_seed_oof
pred = w_lgb*lgb_mean_seed_pred + w_xgb*xgb_mean_seed_pred + w_cbt*cbt_mean_seed_pred

blend_logloss = log_loss(target, oof)
print(f"logloss score: {blend_logloss}")

## Class optimization

In [None]:
def objective_function(X, a0, a1, a2, a3, a4, a5, a6, a7, a8):
    """Log-loss after scaling each class column by its own weight.

    Redefines (and so replaces) any earlier `objective_function`.

    Args:
        X: nine per-class weights; X[k] scales the k-th class column.
        a0 ... a8: the nine per-class probability columns.

    Returns:
        log_loss of the re-weighted, row-normalised probabilities against the
        global `target`.
    """
    class_columns = (a0, a1, a2, a3, a4, a5, a6, a7, a8)
    weighted = np.array([X[k]*column for k, column in enumerate(class_columns)]).transpose()
    # Renormalise so every row sums to one again.
    row_totals = np.sum(weighted, axis=1).reshape(-1, 1)
    return log_loss(target, weighted / row_totals)

In [None]:
# Recompute the integer class labels, then search one weight per class with
# Nelder-Mead, starting from 0.5 for every class.
target = train['target'].apply(lambda label: int(label.split("_")[-1]) - 1)

n_classes = CFG['n_class']
class_columns = tuple([oof[:, class_id] for class_id in range(n_classes)])
res = minimize(
    objective_function,
    [0.5] * n_classes,
    args=class_columns,
    method='Nelder-Mead',
)

res

In [None]:
def _apply_class_weights(probs, weights):
    """Scale each class column of `probs` by its weight and renormalise the rows."""
    scaled = np.array([weights[c]*probs[:, c] for c in range(CFG['n_class'])]).transpose()
    return scaled / np.sum(scaled, axis=1).reshape(-1, 1)


# Both arrays are overwritten in place of the blended ones, so running this
# cell a second time applies the class weights again.
oof = _apply_class_weights(oof, res.x)
pred = _apply_class_weights(pred, res.x)

print(f"logloss score: {log_loss(target, oof)}")

# Check results
---

## Target distribution

In [None]:
# Three side-by-side histograms: true labels, out-of-fold predicted labels and
# predicted labels on the test set.
plt.figure(figsize=(16, 4), tight_layout=True)

oof_labels = pd.Series(np.argmax(oof, axis=1))
pred_labels = pd.Series(np.argmax(pred, axis=1))

plt.subplot(1, 3, 1)
target.hist()

plt.subplot(1, 3, 2)
oof_labels.hist()

plt.subplot(1, 3, 3)
pred_labels.hist()

## Confusion matrix

In [None]:
# Confusion matrix of the true labels against the out-of-fold argmax labels.
oof_labels = oof.argmax(axis=1)
cm = confusion_matrix(target, oof_labels)

plt.figure(figsize=(16, 4))
sns.heatmap(cm, annot=True, fmt='5d', cmap='Blues')
plt.savefig("confusion_matrix.png")

## Classification report

In [None]:
# Print the text report, then save the same metrics as a table in report.csv.
oof_labels = oof.argmax(axis=1)
print(classification_report(target, oof_labels, digits=4))

report_dict = classification_report(target, oof_labels, digits=4, output_dict=True)
report = pd.DataFrame(report_dict).transpose()
report.to_csv("report.csv")

# Submission
---

In [None]:
# Overwrite every column after the first with the final test probabilities,
# then write the submission file without the index.
submission.iloc[:, 1:] = pred  
submission.to_csv("submission.csv", index=False)

In [None]:
# 3x3 grid with one histogram of submitted probabilities per class column.
plt.figure(figsize=(16, 8), tight_layout=True)
for position in range(1, 10):
    class_name = f"Class_{position}"
    plt.subplot(3, 3, position)
    plt.title(class_name)
    submission[class_name].hist()