In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

import datetime

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
train = pd.read_csv("Datasets/train_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
test = pd.read_csv("Datasets/test_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')

In [3]:
dt = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']

In [4]:
def date_transformation(df):
 
    dt = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']
    
    for col in dt:
        
        if df[col].dtype == 'object':
            
            df[col] = df[col].str.replace(':00:00:00', '')
            df[col] = pd.to_datetime(df[col], format='%d%b%Y', errors='coerce')
            
date_transformation(train)
date_transformation(test)

In [5]:
train[dt].dtypes

SIGN_DATE            datetime64[ns]
DATEFIRSTREG         datetime64[ns]
TAXREG_REGDATE       datetime64[ns]
TAXREGPAY_REGDATE    datetime64[ns]
BIRTHDATE            datetime64[ns]
dtype: object

## Transforming Categorial Cols

In [6]:
ct = train.columns[train.dtypes == 'object']
ct

Index(['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP', 'WORKERSRANGE',
       'OKVED_CODE', 'SEX_NAME', 'CITIZENSHIP_NAME'],
      dtype='object')

In [7]:
train[ct] = train[ct].fillna('unknown')
test[ct] = test[ct].fillna('unknown')

## Dataset Transformations:

In [8]:
def transformations(df):
    
    # времени с прошедших n заявок
    for i in range(-3, 4):
        
        if i != 0:
            df['SIGN_DATE_diff' + str(i)] = df.groupby('id_client')['SIGN_DATE'].diff(i).fillna(pd.Timedelta(0)).astype(int).reset_index(level = 0, drop = True)
    
    
    df['weekday'] = df['SIGN_DATE'].transform(lambda x : x.weekday())
    early_time = pd.to_datetime('2018-01-01').value
    
    for col in dt[:-1]:
        
        df[col + "_num"] = df[col].apply(lambda x : x.value - early_time)
        
    #df['month_num'] = df['SIGN_DATE'].transform(lambda x : x.month)
    df['month_num'] = df['SIGN_DATE'].transform(lambda x : ((x.month + 2) % 12) < 5).astype(int)
    #df['month_num'] = df['SIGN_DATE'].transform(lambda x : ((month_names.index(x[2:5]) + 2) % 12) < 5).astype(int)

    for col in dt:
        
        df[col].fillna(pd.to_datetime('2018-01-01'), inplace = True)
    
    df['SIGN_FIRST'] = (df['SIGN_DATE'] - df['DATEFIRSTREG']).dt.days.astype(int)
    df['TAXREGS'] = (df['TAXREGPAY_REGDATE'] - df['TAXREG_REGDATE']).dt.days.astype(int)
    
    return df

In [9]:
train = transformations(train)
test = transformations(test)

In [10]:
np.unique(train['weekday'], return_counts = True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([5315, 3890, 2958, 2404, 2766,  393,  165]))

## CatBoost

In [11]:
del_cols = ['FLAG_DISQUALIFICATION', 'SUM_95_EVER', 'F1400', 'F1410']

In [12]:
feat_cols = [col for col in train.columns if col not in (['id_contract', 'TARGET'] + dt + del_cols)]

In [13]:
feat_cols.__len__()

127

In [14]:
train[feat_cols].dtypes

id_client                  int64
IP_flag                    int64
F1100                    float64
F1110                    float64
F1150                    float64
                          ...   
TAXREG_REGDATE_num        object
TAXREGPAY_REGDATE_num     object
month_num                  int64
SIGN_FIRST                 int64
TAXREGS                    int64
Length: 127, dtype: object

In [59]:
params = {
    #'task_type' : 'GPU',
    #'n_estimators' : 125,
    #'max_depth' : 13,
    #'learning_rate' : 0.062516,
    'random_strength' : 10,
    'grow_policy': 'SymmetricTree'   
}

In [60]:
val = train.loc[train['SIGN_DATE'] > pd.to_datetime('2019-01-01')]
train1 = train.loc[train['SIGN_DATE'] <= pd.to_datetime('2019-01-01')]

In [61]:
val_target = train.loc[train['SIGN_DATE'] > pd.to_datetime('2019-01-01'), 'TARGET']
target1 = train.loc[train['SIGN_DATE'] <= pd.to_datetime('2019-01-01'), 'TARGET']

In [62]:
model = CatBoostClassifier(cat_features=ct.to_list(), objective='Logloss', **params)
#model = CatBoostClassifier(objective = 'Logloss')

In [63]:
model.fit(val, val_target, eval_set=((train1, target1)))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.043196
0:	learn: 0.5613205	test: 0.5627390	best: 0.5627390 (0)	total: 55.5ms	remaining: 55.4s
1:	learn: 0.4454132	test: 0.4468846	best: 0.4468846 (1)	total: 95.5ms	remaining: 47.6s
2:	learn: 0.3559397	test: 0.3563148	best: 0.3563148 (2)	total: 112ms	remaining: 37.4s
3:	learn: 0.2871119	test: 0.2888311	best: 0.2888311 (3)	total: 133ms	remaining: 33.2s
4:	learn: 0.2314772	test: 0.2439741	best: 0.2439741 (4)	total: 145ms	remaining: 28.9s
5:	learn: 0.1890765	test: 0.1996611	best: 0.1996611 (5)	total: 158ms	remaining: 26.2s
6:	learn: 0.1536955	test: 0.1627121	best: 0.1627121 (6)	total: 173ms	remaining: 24.6s
7:	learn: 0.1240168	test: 0.1317679	best: 0.1317679 (7)	total: 185ms	remaining: 22.9s
8:	learn: 0.1029197	test: 0.1103085	best: 0.1103085 (8)	total: 201ms	remaining: 22.1s
9:	learn: 0.0836976	test: 0.0903229	best: 0.0903229 (9)	total: 216ms	remaining: 21.4s
10:	learn: 0.0708427	test: 0.0771620	best: 0.0771620 (10)	total: 228ms	remaining: 20.5s
11:	learn: 0.0590349

95:	learn: 0.0004819	test: 0.0008785	best: 0.0008785 (95)	total: 1.44s	remaining: 13.6s
96:	learn: 0.0004652	test: 0.0008575	best: 0.0008575 (96)	total: 1.45s	remaining: 13.5s
97:	learn: 0.0004504	test: 0.0008384	best: 0.0008384 (97)	total: 1.46s	remaining: 13.4s
98:	learn: 0.0004504	test: 0.0008384	best: 0.0008384 (98)	total: 1.47s	remaining: 13.4s
99:	learn: 0.0004504	test: 0.0008384	best: 0.0008384 (99)	total: 1.49s	remaining: 13.4s
100:	learn: 0.0004504	test: 0.0008384	best: 0.0008384 (100)	total: 1.5s	remaining: 13.4s
101:	learn: 0.0004503	test: 0.0008384	best: 0.0008384 (101)	total: 1.52s	remaining: 13.3s
102:	learn: 0.0004397	test: 0.0008196	best: 0.0008196 (102)	total: 1.53s	remaining: 13.3s
103:	learn: 0.0004280	test: 0.0008056	best: 0.0008056 (103)	total: 1.54s	remaining: 13.3s
104:	learn: 0.0004146	test: 0.0007888	best: 0.0007888 (104)	total: 1.55s	remaining: 13.2s
105:	learn: 0.0004146	test: 0.0007888	best: 0.0007888 (104)	total: 1.55s	remaining: 13.1s
106:	learn: 0.0004146

190:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (190)	total: 2.7s	remaining: 11.4s
191:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (191)	total: 2.71s	remaining: 11.4s
192:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (192)	total: 2.72s	remaining: 11.4s
193:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (193)	total: 2.74s	remaining: 11.4s
194:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (194)	total: 2.74s	remaining: 11.3s
195:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (194)	total: 2.76s	remaining: 11.3s
196:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (194)	total: 2.77s	remaining: 11.3s
197:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (197)	total: 2.78s	remaining: 11.3s
198:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (198)	total: 2.8s	remaining: 11.3s
199:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (199)	total: 2.81s	remaining: 11.2s
200:	learn: 0.0003174	test: 0.0006176	best: 0.0006176 (200)	total: 2.83s	remaining: 11.2s
201:	learn: 

293:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (293)	total: 4.14s	remaining: 9.94s
294:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (294)	total: 4.15s	remaining: 9.92s
295:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (295)	total: 4.17s	remaining: 9.91s
296:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (296)	total: 4.18s	remaining: 9.9s
297:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (297)	total: 4.2s	remaining: 9.89s
298:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (298)	total: 4.21s	remaining: 9.88s
299:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (299)	total: 4.22s	remaining: 9.86s
300:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (300)	total: 4.24s	remaining: 9.85s
301:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (301)	total: 4.26s	remaining: 9.84s
302:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (302)	total: 4.27s	remaining: 9.83s
303:	learn: 0.0003170	test: 0.0006172	best: 0.0006172 (302)	total: 4.29s	remaining: 9.81s
304:	learn: 

386:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (386)	total: 5.39s	remaining: 8.55s
387:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (387)	total: 5.41s	remaining: 8.53s
388:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (388)	total: 5.42s	remaining: 8.52s
389:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (389)	total: 5.43s	remaining: 8.5s
390:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (390)	total: 5.45s	remaining: 8.48s
391:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (391)	total: 5.46s	remaining: 8.47s
392:	learn: 0.0003167	test: 0.0006168	best: 0.0006168 (391)	total: 5.48s	remaining: 8.46s
393:	learn: 0.0003167	test: 0.0006167	best: 0.0006167 (393)	total: 5.49s	remaining: 8.44s
394:	learn: 0.0003167	test: 0.0006167	best: 0.0006167 (394)	total: 5.5s	remaining: 8.43s
395:	learn: 0.0003166	test: 0.0006167	best: 0.0006167 (395)	total: 5.52s	remaining: 8.42s
396:	learn: 0.0003166	test: 0.0006167	best: 0.0006167 (396)	total: 5.53s	remaining: 8.4s
397:	learn: 0

487:	learn: 0.0003164	test: 0.0006164	best: 0.0006164 (487)	total: 6.62s	remaining: 6.95s
488:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (488)	total: 6.63s	remaining: 6.93s
489:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (489)	total: 6.65s	remaining: 6.92s
490:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (490)	total: 6.66s	remaining: 6.9s
491:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (491)	total: 6.67s	remaining: 6.89s
492:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (492)	total: 6.69s	remaining: 6.88s
493:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (493)	total: 6.7s	remaining: 6.86s
494:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (494)	total: 6.71s	remaining: 6.84s
495:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (494)	total: 6.72s	remaining: 6.83s
496:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (496)	total: 6.74s	remaining: 6.82s
497:	learn: 0.0003163	test: 0.0006164	best: 0.0006164 (497)	total: 6.75s	remaining: 6.8s
498:	learn: 0

588:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (587)	total: 7.85s	remaining: 5.48s
589:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (587)	total: 7.87s	remaining: 5.47s
590:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (590)	total: 7.88s	remaining: 5.45s
591:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (591)	total: 7.89s	remaining: 5.44s
592:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (592)	total: 7.91s	remaining: 5.43s
593:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (592)	total: 7.92s	remaining: 5.41s
594:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (594)	total: 7.93s	remaining: 5.4s
595:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (595)	total: 7.94s	remaining: 5.38s
596:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (596)	total: 7.95s	remaining: 5.37s
597:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (597)	total: 7.96s	remaining: 5.35s
598:	learn: 0.0003161	test: 0.0006161	best: 0.0006161 (597)	total: 7.98s	remaining: 5.34s
599:	learn:

683:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (683)	total: 9.09s	remaining: 4.2s
684:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (684)	total: 9.11s	remaining: 4.19s
685:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (685)	total: 9.13s	remaining: 4.18s
686:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (686)	total: 9.14s	remaining: 4.16s
687:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (687)	total: 9.15s	remaining: 4.15s
688:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (688)	total: 9.16s	remaining: 4.13s
689:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (689)	total: 9.17s	remaining: 4.12s
690:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (690)	total: 9.18s	remaining: 4.1s
691:	learn: 0.0003158	test: 0.0006157	best: 0.0006157 (691)	total: 9.19s	remaining: 4.09s
692:	learn: 0.0003157	test: 0.0006157	best: 0.0006157 (692)	total: 9.2s	remaining: 4.08s
693:	learn: 0.0003157	test: 0.0006157	best: 0.0006157 (693)	total: 9.21s	remaining: 4.06s
694:	learn: 0

780:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (780)	total: 10.3s	remaining: 2.9s
781:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (781)	total: 10.3s	remaining: 2.88s
782:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (782)	total: 10.4s	remaining: 2.87s
783:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (783)	total: 10.4s	remaining: 2.86s
784:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (784)	total: 10.4s	remaining: 2.84s
785:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (785)	total: 10.4s	remaining: 2.83s
786:	learn: 0.0003156	test: 0.0006155	best: 0.0006155 (786)	total: 10.4s	remaining: 2.82s
787:	learn: 0.0003155	test: 0.0006155	best: 0.0006155 (787)	total: 10.4s	remaining: 2.8s
788:	learn: 0.0003155	test: 0.0006155	best: 0.0006155 (788)	total: 10.4s	remaining: 2.79s
789:	learn: 0.0003155	test: 0.0006155	best: 0.0006155 (789)	total: 10.4s	remaining: 2.77s
790:	learn: 0.0003155	test: 0.0006155	best: 0.0006155 (790)	total: 10.5s	remaining: 2.76s
791:	learn: 

877:	learn: 0.0003153	test: 0.0006152	best: 0.0006152 (877)	total: 11.6s	remaining: 1.6s
878:	learn: 0.0003153	test: 0.0006152	best: 0.0006152 (878)	total: 11.6s	remaining: 1.59s
879:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (879)	total: 11.6s	remaining: 1.58s
880:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (880)	total: 11.6s	remaining: 1.56s
881:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (881)	total: 11.6s	remaining: 1.55s
882:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (882)	total: 11.6s	remaining: 1.54s
883:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (883)	total: 11.6s	remaining: 1.52s
884:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (884)	total: 11.6s	remaining: 1.51s
885:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (884)	total: 11.6s	remaining: 1.5s
886:	learn: 0.0003153	test: 0.0006151	best: 0.0006151 (886)	total: 11.7s	remaining: 1.49s
887:	learn: 0.0003152	test: 0.0006151	best: 0.0006151 (887)	total: 11.7s	remaining: 1.47s
888:	learn: 

972:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (970)	total: 12.8s	remaining: 355ms
973:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (973)	total: 12.8s	remaining: 342ms
974:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (974)	total: 12.8s	remaining: 328ms
975:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (975)	total: 12.8s	remaining: 315ms
976:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (975)	total: 12.8s	remaining: 302ms
977:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (977)	total: 12.8s	remaining: 289ms
978:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (978)	total: 12.9s	remaining: 276ms
979:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (979)	total: 12.9s	remaining: 263ms
980:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (979)	total: 12.9s	remaining: 249ms
981:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (981)	total: 12.9s	remaining: 236ms
982:	learn: 0.0003150	test: 0.0006148	best: 0.0006148 (982)	total: 12.9s	remaining: 223ms
983:	learn

<catboost.core.CatBoostClassifier at 0x7efd72285b20>