In [3]:
# import modules
import warnings
warnings.filterwarnings('ignore')

import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, log_loss, f1_score

import random
from math import floor
from scipy.stats import mode, scoreatpercentile

print("Seaborn version : ", sns.__version__)
sns.set()
#sns.set_style('whitegrid')
sns.set_color_codes()
sns.set_theme(style="ticks", palette="pastel")

Seaborn version :  0.11.1


In [4]:
# upload data
train = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/train.csv',
                   index_col=0)
test = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/test.csv',
                  index_col=0)
submit = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/sample_submission.csv')

In [5]:
for df in [train, test]:
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['gender', 'car', 'reality', 'work_phone', 'phone', 'email']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    df.drop('FLAG_MOBIL', axis = 1, inplace = True)
    df['adj_DAYS_EMPLOYED_replace_0'] = -df.DAYS_EMPLOYED.replace({365243 : 0})
    df['DAYS_EMPLOYED_missing'] = (df.DAYS_EMPLOYED == 365243).astype('int')
    df['adj_begin_month'] = -df.begin_month
    df['adj_income_type'] = df.income_type
    df.loc[df.income_type == 'Student', 'adj_income_type'] = 'Working'
    df['adj_edu_type'] = df.edu_type 
    df.loc[df.edu_type == 'Academic degree', 'adj_edu_type'] = 'Higher education'
    df['adj_family_type'] = df['family_type']
    df['adj_family_type'].loc[(df.family_type == 'Single / not married')&(df.family_size - df.child_num == 2)] = 'Married'
    df['exp_num'] = 0
    df['exp_num'].loc[df.family_type == 'Married'] = 2
    df['exp_num'].loc[df.family_type == 'Civil marriage'] = 2
    df['exp_num'].loc[df.family_type == 'Separated'] = 1
    df['exp_num'].loc[df.family_type == 'Single / not married'] = 1
    df['exp_num'].loc[df.family_type == 'Widow'] = 1
    df['odd_family_size'] = 0
    df['odd_family_size'].loc[(df.family_size - df.child_num) != df.exp_num] = 1
    df['_single_parents'] = ((df.family_type == 'Single / not married')&(df.child_num != 0)).astype('int')
    df['_single_live'] = (df.family_size == 1).astype('int')
    df['adj_occyp_type'] = df.occyp_type.fillna('missing')
    df['_missing_occyp_type'] = df.occyp_type.isna().astype('int')
    df.loc[(df.DAYS_EMPLOYED == 365243)&(df.occyp_type.isna()), 'adj_occyp_type'] = 'inoccyp'
    df.loc[(df.DAYS_EMPLOYED != 365243)&(df.occyp_type.isna()), 'adj_occyp_type'] = 'non_enter'
    df['_age'] = -df.DAYS_BIRTH/365.25

    df['ID'] = \
    df['gender'].astype('str') + \
    df['car'].astype('str') + \
    df['reality'].astype('str') + '_' + \
    df['child_num'].astype('str') + '_' + \
    df['income_total'].astype('str') + '_' + \
    df['income_type'].astype('str') + '_' + \
    df['family_type'].astype('str') + '_' + \
    df['house_type'].astype('str') + '_' + \
    df['phone'].astype('str') + '_' + \
    df['email'].astype('str') + '_' + \
    df['family_size'].astype('str') + '_' + \
    df['DAYS_BIRTH'].astype('str') + '_' + \
    df['DAYS_EMPLOYED'].astype('str') + '_' + \
    df['occyp_type'].astype('str') 
    
    df['_card_num'] = df.groupby('ID').ID.transform(len)
    
    df['adj_begin_month'] = -df.begin_month
    df['_begin_month_max'] = df.groupby('ID').adj_begin_month.transform(max)
    df['_begin_month_mean'] = df.groupby('ID').adj_begin_month.transform(np.mean)
    df['_begin_month_min'] = df.groupby('ID').adj_begin_month.transform(min)

In [6]:
personal_info = train.drop(['credit', 'begin_month'], axis = 1).drop_duplicates(subset="ID", keep='first', inplace=False, ignore_index=True)
personal_info_test = pd.concat([train.drop(['credit'], axis = 1), test]).drop(['begin_month'], axis = 1).drop_duplicates(subset="ID", keep='first', inplace=False, ignore_index=True)

for personal_df in [personal_info, personal_info_test]:
    ####### 양육비 변수 ######
    child_fee = (personal_df.income_total/personal_df.family_size)[(personal_df._age > 33) & (personal_df._age < 37) & (personal_df.child_num == 1)].mean()

    def child_fee_age_weights(x) : 
        from scipy.stats import norm
        sd = personal_df._age.std()
        return norm(35, scale = sd).pdf(x) / norm(35, scale = sd).pdf(35)
    personal_df['child_fees'] = (np.log(personal_df.child_num + 1)/np.log(2)) * (child_fee) * personal_df._age.apply(child_fee_age_weights)

    ####### 차유지비 변수 ######
    personal_df.income_total.median()*0.1
    def car_weight(x):
        _med = personal_df.income_total.median()
        _max = personal_df.income_total.max()
        if x < _med : 
            return 1
        else:
            return 1+(x-_med)/(_max-_med)*5
    personal_df['car_fees'] = personal_df.income_total.median()*0.1*personal_df.car*personal_df.income_total.apply(car_weight)

    ####### 여유금 변수 ######
    personal_df['_save_income'] = personal_df.income_total - personal_df.child_fees - personal_df.car_fees

    ####### 능력 변수 ######
    personal_df['_ability_income_per_age'] = 0
    personal_df['_ability_employ_per_age'] = 0
    personal_df['_ability_income_per_emp'] = 0
    for i in range(len(personal_df)) : 
        L_age = personal_df._age[i] - 3
        R_age = personal_df._age[i] + 3
        _gen = personal_df.gender[i]
        _ages_df = personal_df[['income_total', 'adj_DAYS_EMPLOYED_replace_0']][(personal_df._age > L_age) & (personal_df._age < R_age) & (personal_df.gender == _gen)]
        _med_income = _ages_df['income_total'].median()
        _std_income = _ages_df['income_total'].std()
        _med_employ = _ages_df['adj_DAYS_EMPLOYED_replace_0'].median()
        _std_employ = _ages_df['adj_DAYS_EMPLOYED_replace_0'].std()
        _n_df       = _ages_df.shape[0]
        personal_df.loc[i, '_ability_income_per_age'] = (personal_df.income_total.iloc[i] - _med_income) / (_std_income /np.sqrt(_n_df))
        personal_df.loc[i, '_ability_employ_per_age'] = (personal_df.adj_DAYS_EMPLOYED_replace_0.iloc[i] - _med_employ) / (_std_employ/np.sqrt(_n_df))

        if personal_df.adj_DAYS_EMPLOYED_replace_0.iloc[i] != 0:    
            L_emp = personal_df.adj_DAYS_EMPLOYED_replace_0 - 365
            R_emp = personal_df.adj_DAYS_EMPLOYED_replace_0 + 365
            _emps_df = personal_df.income_total[(personal_df.adj_DAYS_EMPLOYED_replace_0 > L_emp)&(personal_df.adj_DAYS_EMPLOYED_replace_0 < R_emp)]
            _med = _emps_df.median()
            _std = _emps_df.std()
            personal_df.loc[i, '_ability_income_per_emp'] = (personal_df.income_total.iloc[i] - _med)/(_std/np.sqrt(_n_df))

In [7]:
train  = pd.merge(train, personal_info[['ID', '_save_income', 'child_fees', 'car_fees', '_ability_income_per_age', '_ability_employ_per_age', '_ability_income_per_emp']], on = 'ID', how = 'left')
test = pd.merge(test, personal_info_test[['ID', '_save_income', 'child_fees', 'car_fees', '_ability_income_per_age', '_ability_employ_per_age', '_ability_income_per_emp']], on = 'ID', how = 'left')

In [8]:
for df in [train, test]:
    #카드 소유자가 가진 카드들에 할당할 수 있는 금액을 고려
    df['_income_per_cards']  = df.income_total / np.log(1+df._card_num)
    df['_save_per_cards']    = df._save_income / np.log(1+df._card_num)

    # 가족들에게 할당 될 수 있는 소득 그리고 여유자금을 고려
    df['_income_per_family'] = df.income_total / df.family_size
    df['_save_per_family']   = df._save_income / df.family_size

In [9]:
for df in [train, test] : 
    df['_age'] = df['_age'].apply(lambda x: floor(x))
    df['adj_DAYS_EMPLOYED_mm'] = df['adj_DAYS_EMPLOYED_replace_0'].apply(lambda x: floor(x/30))
    df['adj_DAYS_EMPLOYED_yy'] = df['adj_DAYS_EMPLOYED_replace_0'].apply(lambda x: floor(x/365.25))
    df['log_income_total'] = df['income_total'].apply(lambda x: np.log(1+x))


In [10]:
# dont touch : original
features = ['gender', 
            'car', 
            'reality', 
            #'child_num', 
            'income_total', 
            'adj_income_type', 
            'adj_edu_type', 
            'adj_family_type', 
            'house_type', 
            '_age', 
            'adj_DAYS_EMPLOYED_replace_0', 
            #'DAYS_EMPLOYED_missing',
            'adj_begin_month', '_begin_month_min', #'_begin_month_max', '_begin_month_mean', 
            'work_phone', 
            'phone', 
            'email', 
            'adj_occyp_type', 
            '_missing_occyp_type', 
            #'odd_family_size', '_single_parents', '_single_live', 
            'ID', 
            '_card_num', 
            '_income_per_family', '_income_per_cards', 
            '_save_income', '_save_per_cards', '_save_per_family', 
            '_ability_income_per_age', '_ability_employ_per_age', '_ability_income_per_emp']

In [52]:
features = ['gender', 'car', 'reality', 'child_num', 'adj_income_type', 'adj_edu_type', 
            'adj_family_type', 'house_type', '_age', 'adj_DAYS_EMPLOYED_replace_0', 'adj_DAYS_EMPLOYED_mm', 'adj_DAYS_EMPLOYED_yy',
            'adj_begin_month', 'work_phone', 'phone', 'email', 'adj_occyp_type', 
            'ID', '_begin_month_min',
            '_income_per_family', 'log_income_total',
            '_ability_income_per_age']

In [12]:
features = ['gender', 'car', 'reality', 'child_num',
            'adj_income_type', 'adj_edu_type', 'adj_family_type', 'house_type', 'adj_occyp_type',
            'work_phone', 'phone', 'email',
            '_age',
            'adj_DAYS_EMPLOYED_replace_0', 'ID',
            '_income_per_family', '_save_income', 
            'adj_begin_month','_begin_month_min'            
           ]

In [53]:
numerical_feats = train[features].dtypes[train[features].dtypes != "object"].index.tolist()
#numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train[features].dtypes[train[features].dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  16
Number of Categorical features:  6


In [54]:
train[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       26457 non-null  int64  
 1   car                          26457 non-null  int64  
 2   reality                      26457 non-null  int64  
 3   child_num                    26457 non-null  int64  
 4   adj_income_type              26457 non-null  object 
 5   adj_edu_type                 26457 non-null  object 
 6   adj_family_type              26457 non-null  object 
 7   house_type                   26457 non-null  object 
 8   _age                         26457 non-null  int64  
 9   adj_DAYS_EMPLOYED_replace_0  26457 non-null  int64  
 10  adj_DAYS_EMPLOYED_mm         26457 non-null  int64  
 11  adj_DAYS_EMPLOYED_yy         26457 non-null  int64  
 12  adj_begin_month              26457 non-null  float64
 13  work_phone      

In [55]:
categorical_feats + ['gender', 'car', 'reality', 'phone', 'email', 'work_phone']

['adj_income_type',
 'adj_edu_type',
 'adj_family_type',
 'house_type',
 'adj_occyp_type',
 'ID',
 'gender',
 'car',
 'reality',
 'phone',
 'email',
 'work_phone']

In [56]:
#from category_encoders.ordinal import OrdinalEncoder
#encoder = OrdinalEncoder(categorical_feats)
#train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
#test[categorical_feats] = encoder.transform(test[categorical_feats])

In [57]:
#train['ID'] = train['ID'].astype('int64')
#test['ID'] = test['ID'].astype('int64')

In [58]:
#from sklearn.cluster import KMeans
#kmeans_train = train.drop(['credit'], axis=1)[features]
#kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
#train['cluster'] = kmeans.predict(kmeans_train)
#test['cluster'] = kmeans.predict(test[features])
#features.append('cluster')

In [59]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
#test[numerical_feats] = scaler.transform(test[numerical_feats])

In [60]:
target = 'credit'
X = train.drop(target, axis=1)[features]
y = train[target]
X_test = test[features]

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       26457 non-null  int64  
 1   car                          26457 non-null  int64  
 2   reality                      26457 non-null  int64  
 3   child_num                    26457 non-null  int64  
 4   adj_income_type              26457 non-null  object 
 5   adj_edu_type                 26457 non-null  object 
 6   adj_family_type              26457 non-null  object 
 7   house_type                   26457 non-null  object 
 8   _age                         26457 non-null  int64  
 9   adj_DAYS_EMPLOYED_replace_0  26457 non-null  int64  
 10  adj_DAYS_EMPLOYED_mm         26457 non-null  int64  
 11  adj_DAYS_EMPLOYED_yy         26457 non-null  int64  
 12  adj_begin_month              26457 non-null  float64
 13  work_phone      

In [63]:
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(X, y):
    folds.append((train_idx,valid_idx))

In [64]:
parameters =  {'learning_rate': 0.22999586428143728, 
               'bagging_temperature': 0.022592797420156956, 
               'n_estimators': 2764, 'max_depth': 4, 
               'random_strength': 32, 
               'colsample_bylevel': 0.6332063738136893, 
               'l2_leaf_reg': 8.147757462899138e-06, 
               'min_child_samples': 84, 
               'max_bin': 307, 
               'od_type': 'Iter','boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}

#{'colsample_bylevel': 0.07591531396851062, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}

In [66]:
random.seed(42)
cat_models={}

cat_features = categorical_feats+ ['gender', 'car', 'reality', 'phone', 'email', 'work_phone']

for fold in range(20):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train = X.iloc[train_idx] 
    X_valid = X.iloc[valid_idx]
    y_train = y[train_idx]
    y_valid = y[valid_idx]

    cat = CatBoostClassifier(**parameters)
    cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
    cat_models[fold] = cat
    print(f'================================================================================\n\n')

0:	learn: 1.0075892	test: 1.0076305	test1: 1.0082746	best: 1.0082746 (0)	total: 42.1ms	remaining: 1m 56s
100:	learn: 0.7092165	test: 0.4962936	test1: 0.6637947	best: 0.6637947 (100)	total: 3.34s	remaining: 1m 27s
200:	learn: 0.6884090	test: 0.4813628	test1: 0.6590060	best: 0.6560839 (177)	total: 6.93s	remaining: 1m 28s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6560838651
bestIteration = 177

Shrink model to first 178 iterations.


0:	learn: 1.0076516	test: 1.0076620	test1: 1.0076559	best: 1.0076559 (0)	total: 36.4ms	remaining: 1m 40s
100:	learn: 0.7094452	test: 0.4971911	test1: 0.6583706	best: 0.6583706 (100)	total: 3.33s	remaining: 1m 27s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6562558819
bestIteration = 125

Shrink model to first 126 iterations.


0:	learn: 1.0076575	test: 1.0076677	test1: 1.0076123	best: 1.0076123 (0)	total: 41.3ms	remaining: 1m 54s
100:	learn: 0.7101340	test: 0.4962664	test1: 0.6467890	best: 0.6467890 (100)	tota

0:	learn: 1.0076873	test: 1.0077165	test1: 1.0074983	best: 1.0074983 (0)	total: 42.5ms	remaining: 1m 57s
100:	learn: 0.7100783	test: 0.4925416	test1: 0.6685864	best: 0.6676642 (92)	total: 3.58s	remaining: 1m 34s
200:	learn: 0.6897428	test: 0.4774302	test1: 0.6665351	best: 0.6659949 (188)	total: 7.56s	remaining: 1m 36s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6659949227
bestIteration = 188

Shrink model to first 189 iterations.


0:	learn: 1.0076433	test: 1.0076760	test1: 1.0076831	best: 1.0076831 (0)	total: 51.1ms	remaining: 2m 21s
100:	learn: 0.7081342	test: 0.4954872	test1: 0.7007473	best: 0.7005288 (95)	total: 3.73s	remaining: 1m 38s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6985732192
bestIteration = 116

Shrink model to first 117 iterations.


0:	learn: 1.0076429	test: 1.0076590	test1: 1.0077665	best: 1.0077665 (0)	total: 48.2ms	remaining: 2m 13s
100:	learn: 0.7077298	test: 0.4940290	test1: 0.6749161	best: 0.6744760 (85)	total: 

In [24]:
def objective(trial):
    param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)

    cat_features = categorical_feats + ['gender', 'car', 'reality', 'phone', 'email', 'work_phone']
    cat = CatBoostClassifier(**param)
    cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
    cat_pred = cat.predict_proba(X_valid)
    log_score = log_loss(y_valid, cat_pred)

    return log_score

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2021-06-13 15:39:55,374][0m A new study created in memory with name: cat_parameter_opt[0m


0:	learn: 1.0826643	test: 1.0826643	test1: 1.0833388	best: 1.0833388 (0)	total: 54.4ms	remaining: 6m 53s


[32m[I 2021-06-13 15:39:56,123][0m Trial 0 finished with value: 0.895646511326092 and parameters: {'learning_rate': 0.03574712922600244, 'bagging_temperature': 63.512210106407046, 'n_estimators': 7588, 'max_depth': 11, 'random_strength': 15, 'colsample_bylevel': 0.49359671220172163, 'l2_leaf_reg': 1.7519275289243016e-06, 'min_child_samples': 88, 'max_bin': 380, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.895646511326092.[0m


100:	learn: 0.8792809	test: 0.8792809	test1: 0.8956566	best: 0.8956465 (94)	total: 392ms	remaining: 29.1s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.8956465113
bestIteration = 94

Shrink model to first 95 iterations.
0:	learn: 0.9932747	test: 0.9932747	test1: 0.9937626	best: 0.9937626 (0)	total: 23.7ms	remaining: 1m 9s
100:	learn: 0.6933543	test: 0.7085416	test1: 0.7853501	best: 0.7853501 (100)	total: 2.57s	remaining: 1m 11s
200:	learn: 0.5878195	test: 0.5021087	test1: 0.7184069	best: 0.7184069 (200)	total: 5.11s	remaining: 1m 8s


[32m[I 2021-06-13 15:40:02,547][0m Trial 1 finished with value: 0.7166143314176118 and parameters: {'learning_rate': 0.2708160864249968, 'bagging_temperature': 21.368329072358772, 'n_estimators': 2911, 'max_depth': 6, 'random_strength': 18, 'colsample_bylevel': 0.5825453457757226, 'l2_leaf_reg': 1.5747445384650815e-05, 'min_child_samples': 46, 'max_bin': 287, 'od_type': 'IncToDec'}. Best is trial 1 with value: 0.7166143314176118.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7166143314
bestIteration = 206

Shrink model to first 207 iterations.
0:	learn: 1.0866355	test: 1.0866796	test1: 1.0866776	best: 1.0866776 (0)	total: 21.8ms	remaining: 1m 51s
100:	learn: 0.7973409	test: 0.6967657	test1: 0.7894287	best: 0.7894287 (100)	total: 17.5s	remaining: 14m 33s
200:	learn: 0.7465083	test: 0.5877319	test1: 0.7414034	best: 0.7414034 (200)	total: 43.7s	remaining: 17m 50s
300:	learn: 0.7272076	test: 0.5407806	test1: 0.7270565	best: 0.7270565 (300)	total: 1m 12s	remaining: 19m 16s
400:	learn: 0.7066840	test: 0.5118826	test1: 0.7149905	best: 0.7149905 (400)	total: 1m 36s	remaining: 18m 52s
500:	learn: 0.6004482	test: 0.4698172	test1: 0.7021202	best: 0.7021096 (498)	total: 2m 55s	remaining: 26m 53s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7016938512
bestIteration = 513

Shrink model to first 514 iterations.


[32m[I 2021-06-13 15:44:04,033][0m Trial 2 finished with value: 0.7016938511665621 and parameters: {'learning_rate': 0.027010527749605478, 'bagging_temperature': 0.2920433847181412, 'n_estimators': 5105, 'max_depth': 14, 'random_strength': 20, 'colsample_bylevel': 0.708540663048167, 'l2_leaf_reg': 1.7776512920172654e-05, 'min_child_samples': 9, 'max_bin': 382, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.7016938511665621.[0m


In [69]:
submit

Unnamed: 0,index,0,1,2
0,26457,0,0,0
1,26458,0,0,0
2,26459,0,0,0
3,26460,0,0,0
4,26461,0,0,0
...,...,...,...,...
9995,36452,0,0,0
9996,36453,0,0,0
9997,36454,0,0,0
9998,36455,0,0,0


In [68]:
submit.iloc[:,1:] = 0

In [70]:
for fold in range(5):
      submit.iloc[:,1:] += cat_models[fold].predict_proba(X_test)/5

In [71]:
submit

Unnamed: 0,index,0,1,2
0,26457,0.101492,0.126799,0.771709
1,26458,0.119065,0.133083,0.747853
2,26459,0.087422,0.134535,0.778043
3,26460,0.119069,0.129180,0.751750
4,26461,0.132817,0.176088,0.691095
...,...,...,...,...
9995,36452,0.103484,0.207399,0.689116
9996,36453,0.096491,0.173502,0.730007
9997,36454,0.076300,0.198735,0.724965
9998,36455,0.093400,0.146454,0.760146


In [72]:
submit.to_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/preprocessing/catboost_optuna_basic_cv20.csv', index = False)