In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [43]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')

In [5]:
train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 299999
Data columns (total 24 columns):
bin_0     300000 non-null int64
bin_1     300000 non-null int64
bin_2     300000 non-null int64
bin_3     300000 non-null object
bin_4     300000 non-null object
nom_0     300000 non-null object
nom_1     300000 non-null object
nom_2     300000 non-null object
nom_3     300000 non-null object
nom_4     300000 non-null object
nom_5     300000 non-null object
nom_6     300000 non-null object
nom_7     300000 non-null object
nom_8     300000 non-null object
nom_9     300000 non-null object
ord_0     300000 non-null int64
ord_1     300000 non-null object
ord_2     300000 non-null object
ord_3     300000 non-null object
ord_4     300000 non-null object
ord_5     300000 non-null object
day       300000 non-null int64
month     300000 non-null int64
target    300000 non-null int64
dtypes: int64(7), object(17)
memory usage: 57.2+ MB


In [44]:
Y = train['target']
X = train.drop('target', axis=1)

In [42]:
def fit_result(X, Y,
              n_estimators,
              learning_rate,
              max_depth):
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.3, 
                                                    random_state=17)

    XGB = XGBClassifier(nthread=-1, 
                        n_estimators=n_estimators, 
                        learning_rate=learning_rate, 
                        max_depth=max_depth, 
                        subsample=0.9, 
                        colsample_bytree=0.9)
    
    XGB.fit(X_train, y_train, verbose=200)
    xgb_pred = XGB.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, xgb_pred)


## Label encoding

In [45]:
categ_cols = X.select_dtypes(include=['object']).columns.to_list()

In [46]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [47]:
X[categ_cols] = X[categ_cols].apply(lambda x: le.fit_transform(x))
X.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,1,1,5,5,3,0,...,1686,2175,2,2,1,7,3,136,2,2
1,0,1,0,1,1,1,4,3,5,2,...,650,11635,1,2,3,0,0,93,7,8
2,0,0,0,0,1,0,4,4,5,3,...,1932,8078,1,1,4,7,17,31,7,2
3,0,1,0,0,1,2,4,5,0,1,...,629,6056,1,2,0,8,3,134,2,1
4,0,0,0,0,0,2,4,4,0,1,...,1760,8231,1,2,2,0,17,158,7,8


In [51]:
print("tunning learning_rate")
dic = dict()
for l in [0.1, 0.2, 0.3]:
    roc_auc = fit_result(X, Y, 
                 n_estimators=500, 
                 learning_rate=l, 
                 max_depth=3)
    dic[l] = roc_auc
    
dic 

tunning learning_rate


{0.1: 0.7748812042206865, 0.2: 0.7802657459123178, 0.3: 0.7817518415360722}

In [52]:
print("tunning max_depth")
dic = dict()
for m in [3, 4, 5, 6]:
    roc_auc = fit_result(X, Y, 
                 n_estimators=500, 
                 learning_rate=0.1, 
                 max_depth=m)
    dic[m] = roc_auc
    
dic 

tunning max_depth


{3: 0.7748812042206865,
 4: 0.7771718693792293,
 5: 0.7781629894327895,
 6: 0.7778414586582868}

## One hot

In [58]:
from sklearn.preprocessing import OneHotEncoder
one = OneHotEncoder()

In [59]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')
Y = train['target']
X = train.drop('target', axis=1)

In [60]:
X = one.fit_transform(X)

In [61]:
print("tunning learning_rate")
dic = dict()
for l in [0.1, 0.2, 0.3]:
    roc_auc = fit_result(X, Y, 
                 n_estimators=500, 
                 learning_rate=l, 
                 max_depth=3)
    dic[l] = roc_auc
    
dic 

tunning learning_rate


{0.1: 0.7640804840178449, 0.2: 0.774667781780568, 0.3: 0.7782865779450621}

In [63]:
print("tunning max_depth")
dic = dict()
for m in [7, 9, 11, 13]:
    roc_auc = fit_result(X, Y, 
                 n_estimators=500, 
                 learning_rate=0.1, 
                 max_depth=m)
    dic[m] = roc_auc
    
dic 

tunning max_depth


{7: 0.7724984582977144,
 9: 0.7721514945772938,
 11: 0.7712787277817651,
 13: 0.7694221564511936}

## Target encoding

In [64]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')
train['day']=train['day'].astype('object')
train['month']=train['month'].astype('object')

In [66]:
categ_cols = train.select_dtypes(include=['object']).columns.to_list()

In [69]:
for col in categ_cols:
    train[col] = train[col].map(train.groupby(col)['target'].mean())

In [70]:
train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0.302537,0.290107,0.327145,0.360978,0.307162,0.242813,0.237743,...,0.368421,2,0.403885,0.257877,0.306993,0.208354,0.401186,0.322048,0.244432,0
1,0,1,0,0.302537,0.290107,0.327145,0.290054,0.359209,0.289954,0.304164,...,0.076923,1,0.403885,0.326315,0.206599,0.186877,0.30388,0.340292,0.327496,0
2,0,0,0,0.309384,0.290107,0.24179,0.290054,0.293085,0.289954,0.353951,...,0.172414,1,0.317175,0.403126,0.306993,0.351864,0.206843,0.340292,0.244432,0
3,0,1,0,0.309384,0.290107,0.351052,0.290054,0.307162,0.339793,0.329472,...,0.227273,1,0.403885,0.360961,0.330148,0.208354,0.355985,0.322048,0.255729,1
4,0,0,0,0.309384,0.333773,0.351052,0.290054,0.293085,0.339793,0.329472,...,0.2,1,0.403885,0.225214,0.206599,0.351864,0.404345,0.340292,0.327496,0


In [71]:
Y = train['target']
X = train.drop('target', axis=1)

In [72]:
print("tunning learning_rate")
dic = dict()
for l in [0.1, 0.2, 0.3]:
    roc_auc = fit_result(X, Y, 
                 n_estimators=500, 
                 learning_rate=l, 
                 max_depth=3)
    dic[l] = roc_auc
    
dic 

tunning learning_rate


{0.1: 0.8309870253690351, 0.2: 0.8294146037197574, 0.3: 0.8269541272511554}

## Final

In [24]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from hyperopt import hp, tpe, space_eval
from hyperopt.fmin import fmin

In [43]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')
train['day']=train['day'].astype('object')
train['month']=train['month'].astype('object')

In [44]:
categ_cols = train.select_dtypes(include=['object']).columns.to_list()
for col in categ_cols:
    train[col] = train[col].map(train.groupby(col)['target'].mean())

In [45]:
Y = train['target']
X = train.drop('target', axis=1)

In [19]:
def function(params):
    params = {
        'learning_rate': params['learning_rate'], 
        'max_depth': params['max_depth'], 
        'subsample': params['subsample'], # Доля фичей
        'colsample_bytree': params['colsample_bytree'] # Доля объектов
    }
    
    print("############## RUN ################")
    print("params = {params}".format(params=params))
    
    LGBM = LGBMClassifier(
        n_jobs=-1, 
        n_estimators=500,
        verbose=200,
        **params)
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.3, 
                                                    random_state=17)

       
    LGBM.fit(X_train, y_train)
    pred = LGBM.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, pred)
    print("Score: {score}".format(score=str(score)))
    
    return -score
    
    

In [20]:
lgbm_space =  {
            'learning_rate': hp.quniform('learning_rate', 0.1, 0.5, 0.1),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
            'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1)
    }

In [21]:
best = fmin(fn=function,
            space=lgbm_space,
            # tpe - Tree of Parzen Estimators (TPE)
            algo=tpe.suggest,
            max_evals=50
           )

############## RUN ################                 
params = {'subsample': 0.6000000000000001, 'learning_rate': 0.4, 'colsample_bytree': 0.9, 'max_depth': 9}
Score: 0.8076037327952863                           
############## RUN ################                                          
params = {'subsample': 0.6000000000000001, 'learning_rate': 0.30000000000000004, 'colsample_bytree': 0.6000000000000001, 'max_depth': 11}
Score: 0.8167965095863163                                                    
############## RUN ################                                          
params = {'subsample': 0.6000000000000001, 'learning_rate': 0.2, 'colsample_bytree': 0.7000000000000001, 'max_depth': 5}
Score: 0.8235787409026047                                                    
############## RUN ################                                          
params = {'subsample': 1.0, 'learning_rate': 0.30000000000000004, 'colsample_bytree': 0.7000000000000001, 'max_depth': 12}
Score: 0.8159380

In [25]:
best_params = space_eval(lgbm_space, best)
#best_params['max_depth'] = int(best_params['max_depth'])
best_params

{'colsample_bytree': 0.7000000000000001,
 'learning_rate': 0.30000000000000004,
 'max_depth': 1,
 'subsample': 0.7000000000000001}

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.3, 
                                                    random_state=17)

In [49]:
LGBM = LGBMClassifier(
        n_jobs=-1, 
        n_estimators=5000,
        verbose=100,
        **best_params
)

LGBM.fit(X_train, 
        y_train,
        eval_metric=['auc'],
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=100)

[1]	valid_0's auc: 0.584435	valid_0's binary_logloss: 0.609584
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.635724	valid_0's binary_logloss: 0.601679
[3]	valid_0's auc: 0.662448	valid_0's binary_logloss: 0.596432
[4]	valid_0's auc: 0.665231	valid_0's binary_logloss: 0.591652
[5]	valid_0's auc: 0.684452	valid_0's binary_logloss: 0.58651
[6]	valid_0's auc: 0.700211	valid_0's binary_logloss: 0.581926
[7]	valid_0's auc: 0.713198	valid_0's binary_logloss: 0.578573
[8]	valid_0's auc: 0.709659	valid_0's binary_logloss: 0.575494
[9]	valid_0's auc: 0.715195	valid_0's binary_logloss: 0.572057
[10]	valid_0's auc: 0.724778	valid_0's binary_logloss: 0.569048
[11]	valid_0's auc: 0.733125	valid_0's binary_logloss: 0.566391
[12]	valid_0's auc: 0.742462	valid_0's binary_logloss: 0.563164
[13]	valid_0's auc: 0.745042	valid_0's binary_logloss: 0.560469
[14]	valid_0's auc: 0.747244	valid_0's binary_logloss: 0.557888
[15]	valid_0's auc: 0.753503	valid_0's binary_loglos

[129]	valid_0's auc: 0.826816	valid_0's binary_logloss: 0.468308
[130]	valid_0's auc: 0.826857	valid_0's binary_logloss: 0.468161
[131]	valid_0's auc: 0.826862	valid_0's binary_logloss: 0.46799
[132]	valid_0's auc: 0.826962	valid_0's binary_logloss: 0.467738
[133]	valid_0's auc: 0.827074	valid_0's binary_logloss: 0.467543
[134]	valid_0's auc: 0.827237	valid_0's binary_logloss: 0.467356
[135]	valid_0's auc: 0.827252	valid_0's binary_logloss: 0.467132
[136]	valid_0's auc: 0.827329	valid_0's binary_logloss: 0.466935
[137]	valid_0's auc: 0.827406	valid_0's binary_logloss: 0.466765
[138]	valid_0's auc: 0.827487	valid_0's binary_logloss: 0.466569
[139]	valid_0's auc: 0.827597	valid_0's binary_logloss: 0.466365
[140]	valid_0's auc: 0.827648	valid_0's binary_logloss: 0.466211
[141]	valid_0's auc: 0.827708	valid_0's binary_logloss: 0.465987
[142]	valid_0's auc: 0.827654	valid_0's binary_logloss: 0.4658
[143]	valid_0's auc: 0.827906	valid_0's binary_logloss: 0.465558
[144]	valid_0's auc: 0.82807

[257]	valid_0's auc: 0.831432	valid_0's binary_logloss: 0.45523
[258]	valid_0's auc: 0.831447	valid_0's binary_logloss: 0.455173
[259]	valid_0's auc: 0.831429	valid_0's binary_logloss: 0.455154
[260]	valid_0's auc: 0.831467	valid_0's binary_logloss: 0.455101
[261]	valid_0's auc: 0.831473	valid_0's binary_logloss: 0.455061
[262]	valid_0's auc: 0.831469	valid_0's binary_logloss: 0.455042
[263]	valid_0's auc: 0.831487	valid_0's binary_logloss: 0.455009
[264]	valid_0's auc: 0.831494	valid_0's binary_logloss: 0.454957
[265]	valid_0's auc: 0.831528	valid_0's binary_logloss: 0.454899
[266]	valid_0's auc: 0.831503	valid_0's binary_logloss: 0.454897
[267]	valid_0's auc: 0.831509	valid_0's binary_logloss: 0.454842
[268]	valid_0's auc: 0.831534	valid_0's binary_logloss: 0.454789
[269]	valid_0's auc: 0.831515	valid_0's binary_logloss: 0.45475
[270]	valid_0's auc: 0.831519	valid_0's binary_logloss: 0.454705
[271]	valid_0's auc: 0.831563	valid_0's binary_logloss: 0.454658
[272]	valid_0's auc: 0.8315

[385]	valid_0's auc: 0.832224	valid_0's binary_logloss: 0.45247
[386]	valid_0's auc: 0.832225	valid_0's binary_logloss: 0.452459
[387]	valid_0's auc: 0.832227	valid_0's binary_logloss: 0.452455
[388]	valid_0's auc: 0.832226	valid_0's binary_logloss: 0.452452
[389]	valid_0's auc: 0.832221	valid_0's binary_logloss: 0.452451
[390]	valid_0's auc: 0.832236	valid_0's binary_logloss: 0.452432
[391]	valid_0's auc: 0.832225	valid_0's binary_logloss: 0.452437
[392]	valid_0's auc: 0.83221	valid_0's binary_logloss: 0.452446
[393]	valid_0's auc: 0.832204	valid_0's binary_logloss: 0.452444
[394]	valid_0's auc: 0.832209	valid_0's binary_logloss: 0.452426
[395]	valid_0's auc: 0.832226	valid_0's binary_logloss: 0.452402
[396]	valid_0's auc: 0.832221	valid_0's binary_logloss: 0.452396
[397]	valid_0's auc: 0.832209	valid_0's binary_logloss: 0.452397
[398]	valid_0's auc: 0.832217	valid_0's binary_logloss: 0.452385
[399]	valid_0's auc: 0.832223	valid_0's binary_logloss: 0.452368
[400]	valid_0's auc: 0.8322

[513]	valid_0's auc: 0.832411	valid_0's binary_logloss: 0.45181
[514]	valid_0's auc: 0.832413	valid_0's binary_logloss: 0.451803
[515]	valid_0's auc: 0.832414	valid_0's binary_logloss: 0.4518
[516]	valid_0's auc: 0.832414	valid_0's binary_logloss: 0.4518
[517]	valid_0's auc: 0.83241	valid_0's binary_logloss: 0.451803
[518]	valid_0's auc: 0.832412	valid_0's binary_logloss: 0.451799
[519]	valid_0's auc: 0.832408	valid_0's binary_logloss: 0.451801
[520]	valid_0's auc: 0.832407	valid_0's binary_logloss: 0.451802
[521]	valid_0's auc: 0.832403	valid_0's binary_logloss: 0.451805
[522]	valid_0's auc: 0.832398	valid_0's binary_logloss: 0.451809
[523]	valid_0's auc: 0.832398	valid_0's binary_logloss: 0.451808
[524]	valid_0's auc: 0.832402	valid_0's binary_logloss: 0.451802
[525]	valid_0's auc: 0.832407	valid_0's binary_logloss: 0.451797
[526]	valid_0's auc: 0.832409	valid_0's binary_logloss: 0.451794
[527]	valid_0's auc: 0.832405	valid_0's binary_logloss: 0.451795
[528]	valid_0's auc: 0.832414	v

[641]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.451632
[642]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.451631
[643]	valid_0's auc: 0.832493	valid_0's binary_logloss: 0.451627
[644]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.45163
[645]	valid_0's auc: 0.832489	valid_0's binary_logloss: 0.45163
[646]	valid_0's auc: 0.832487	valid_0's binary_logloss: 0.451632
[647]	valid_0's auc: 0.832485	valid_0's binary_logloss: 0.451633
[648]	valid_0's auc: 0.832489	valid_0's binary_logloss: 0.451629
[649]	valid_0's auc: 0.832492	valid_0's binary_logloss: 0.451625
[650]	valid_0's auc: 0.832494	valid_0's binary_logloss: 0.451623
[651]	valid_0's auc: 0.832489	valid_0's binary_logloss: 0.451628
[652]	valid_0's auc: 0.832489	valid_0's binary_logloss: 0.451628
[653]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.451627
[654]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.451626
[655]	valid_0's auc: 0.83249	valid_0's binary_logloss: 0.451626
[656]	valid_0's auc: 0.832491	val

[769]	valid_0's auc: 0.832525	valid_0's binary_logloss: 0.451574
[770]	valid_0's auc: 0.832529	valid_0's binary_logloss: 0.45157
[771]	valid_0's auc: 0.832528	valid_0's binary_logloss: 0.451571
[772]	valid_0's auc: 0.832526	valid_0's binary_logloss: 0.451573
[773]	valid_0's auc: 0.832526	valid_0's binary_logloss: 0.451573
[774]	valid_0's auc: 0.832527	valid_0's binary_logloss: 0.451573
[775]	valid_0's auc: 0.832528	valid_0's binary_logloss: 0.451571
[776]	valid_0's auc: 0.832531	valid_0's binary_logloss: 0.451568
[777]	valid_0's auc: 0.83253	valid_0's binary_logloss: 0.451569
[778]	valid_0's auc: 0.832528	valid_0's binary_logloss: 0.451571
[779]	valid_0's auc: 0.83253	valid_0's binary_logloss: 0.451569
[780]	valid_0's auc: 0.832529	valid_0's binary_logloss: 0.451569
[781]	valid_0's auc: 0.832529	valid_0's binary_logloss: 0.451569
[782]	valid_0's auc: 0.832529	valid_0's binary_logloss: 0.451569
[783]	valid_0's auc: 0.832527	valid_0's binary_logloss: 0.451571
[784]	valid_0's auc: 0.83252

[897]	valid_0's auc: 0.832549	valid_0's binary_logloss: 0.451545
[898]	valid_0's auc: 0.832548	valid_0's binary_logloss: 0.451547
[899]	valid_0's auc: 0.832548	valid_0's binary_logloss: 0.451547
[900]	valid_0's auc: 0.832548	valid_0's binary_logloss: 0.451546
[901]	valid_0's auc: 0.832548	valid_0's binary_logloss: 0.451547
[902]	valid_0's auc: 0.83255	valid_0's binary_logloss: 0.451544
[903]	valid_0's auc: 0.83255	valid_0's binary_logloss: 0.451545
[904]	valid_0's auc: 0.832549	valid_0's binary_logloss: 0.451545
[905]	valid_0's auc: 0.832551	valid_0's binary_logloss: 0.451543
[906]	valid_0's auc: 0.832549	valid_0's binary_logloss: 0.451545
[907]	valid_0's auc: 0.832549	valid_0's binary_logloss: 0.451545
[908]	valid_0's auc: 0.832551	valid_0's binary_logloss: 0.451544
[909]	valid_0's auc: 0.83255	valid_0's binary_logloss: 0.451545
[910]	valid_0's auc: 0.832552	valid_0's binary_logloss: 0.451543
[911]	valid_0's auc: 0.832551	valid_0's binary_logloss: 0.451545
[912]	valid_0's auc: 0.83255

[1025]	valid_0's auc: 0.832563	valid_0's binary_logloss: 0.451534
[1026]	valid_0's auc: 0.832564	valid_0's binary_logloss: 0.451532
[1027]	valid_0's auc: 0.832564	valid_0's binary_logloss: 0.451533
[1028]	valid_0's auc: 0.832563	valid_0's binary_logloss: 0.451534
[1029]	valid_0's auc: 0.832565	valid_0's binary_logloss: 0.451532
[1030]	valid_0's auc: 0.832565	valid_0's binary_logloss: 0.451531
[1031]	valid_0's auc: 0.832564	valid_0's binary_logloss: 0.451533
[1032]	valid_0's auc: 0.832561	valid_0's binary_logloss: 0.451536
[1033]	valid_0's auc: 0.832561	valid_0's binary_logloss: 0.451537
[1034]	valid_0's auc: 0.832561	valid_0's binary_logloss: 0.451537
[1035]	valid_0's auc: 0.832559	valid_0's binary_logloss: 0.451539
[1036]	valid_0's auc: 0.83256	valid_0's binary_logloss: 0.451538
[1037]	valid_0's auc: 0.83256	valid_0's binary_logloss: 0.451537
[1038]	valid_0's auc: 0.832557	valid_0's binary_logloss: 0.45154
[1039]	valid_0's auc: 0.832558	valid_0's binary_logloss: 0.45154
[1040]	valid_0

[1151]	valid_0's auc: 0.83257	valid_0's binary_logloss: 0.451529
[1152]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.45153
[1153]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.451531
[1154]	valid_0's auc: 0.832567	valid_0's binary_logloss: 0.45153
[1155]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.451529
[1156]	valid_0's auc: 0.832569	valid_0's binary_logloss: 0.451529
[1157]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.45153
[1158]	valid_0's auc: 0.832567	valid_0's binary_logloss: 0.451531
[1159]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.451529
[1160]	valid_0's auc: 0.832567	valid_0's binary_logloss: 0.451531
[1161]	valid_0's auc: 0.832569	valid_0's binary_logloss: 0.451529
[1162]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.451529
[1163]	valid_0's auc: 0.832569	valid_0's binary_logloss: 0.451529
[1164]	valid_0's auc: 0.832568	valid_0's binary_logloss: 0.451531
[1165]	valid_0's auc: 0.832567	valid_0's binary_logloss: 0.451531
[1166]	valid_0

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
        importance_type='split', learning_rate=0.3, max_depth=1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=5000, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=0.7, subsample_for_bin=200000, subsample_freq=0,
        verbose=100)

In [34]:
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')
categ_cols = train.select_dtypes(include=['object']).columns.to_list()
for col in categ_cols:
    test[col] = test[col].map(train.groupby(col)['target'].mean())
    
test.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300000,0,0,1,0.302537,0.290107,0.24179,0.360978,0.319017,0.242813,0.304164,...,0.348315,0.181818,2,0.242055,0.288796,0.342476,0.324947,0.300588,0.273121,0.364099
300001,0,0,0,0.302537,0.333773,0.351052,0.338932,0.293085,0.339793,0.304164,...,0.222707,0.288889,1,0.355078,0.403126,0.379277,0.186877,0.244795,0.340292,0.317053
300002,1,0,1,0.309384,0.290107,0.24179,0.338932,0.245139,0.311724,0.304164,...,0.186667,0.090909,2,0.317175,0.225214,0.206599,0.236891,0.417726,0.336113,0.353634
300003,0,0,1,0.302537,0.290107,0.351052,0.310627,0.335367,0.311724,0.304164,...,0.360656,0.32,1,0.278533,0.403126,0.22046,0.336264,0.365151,0.322048,0.280936
300004,0,1,1,0.309384,0.333773,0.351052,0.290054,0.245139,0.311724,0.304164,...,0.375,0.294118,3,0.403885,0.403126,0.379277,0.409481,0.389864,0.252139,0.364099


In [35]:
y_preds = LGBM.predict_proba(test)[:,1] 
y_preds

array([0.20212615, 0.65805746, 0.02674929, ..., 0.43007302, 0.56577997,
       0.2195374 ])

In [36]:
test = pd.read_csv('../data/cat_in_the_dat_test.csv')
test["target"] = y_preds
submission = test[["id", "target"]]
submission.head()

Unnamed: 0,id,target
0,300000,0.202126
1,300001,0.658057
2,300002,0.026749
3,300003,0.580573
4,300004,0.838201


In [39]:
submission.to_csv('../data/Submissions/cat_in_the_dat_sub1.csv', index=False)