In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

import datetime

from warnings import filterwarnings
filterwarnings('ignore')

## Downloading Data

In [13]:
train = pd.read_csv("Datasets/train_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
test = pd.read_csv("Datasets/test_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')

In [19]:
train['OKVED_CODE'] = train['OKVED_CODE'].fillna("-1").transform(lambda x : x.split(".")[0]).replace({"-1" : "unknown"})
test['OKVED_CODE'] = test['OKVED_CODE'].fillna("-1").transform(lambda x : x.split(".")[0]).replace({"-1" : "unknown"})

0             35
1        unknown
2             36
3             35
4        unknown
          ...   
17886         36
17887         86
17888         46
17889         46
17890         43
Name: OKVED_CODE, Length: 17891, dtype: object

In [28]:
train['OKTMO_CODE'] = train['OKTMO_CODE'].fillna("00000000000").astype(str).transform(lambda x : x[:3])
test['OKTMO_CODE'] = test['OKTMO_CODE'].fillna("00000000000").astype(str).transform(lambda x : x[:3])

0        370
1        000
2        337
3        718
4        000
        ... 
17886    176
17887    987
17888    453
17889    170
17890    117
Name: OKTMO_CODE, Length: 17891, dtype: object

## Transforming Datetime cols

In [6]:
dt = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']

In [7]:
def date_transformation(df):
 
    dt = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']
    
    for col in dt:
        
        if df[col].dtype == 'object':
            
            df[col] = df[col].str.replace(':00:00:00', '')
            df[col] = pd.to_datetime(df[col], format='%d%b%Y', errors='coerce')
            
date_transformation(train)
date_transformation(test)

## Transforming Categorial Cols

In [8]:
ct = train.columns[train.dtypes == 'object']
ct

Index(['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP', 'WORKERSRANGE',
       'OKVED_CODE', 'SEX_NAME', 'CITIZENSHIP_NAME'],
      dtype='object')

In [9]:
train[ct] = train[ct].fillna('unknown')
test[ct] = test[ct].fillna('unknown')

In [10]:
F_cols = [col for col in train.columns if "F1" == col[:2] or "F2" == col[:2]]
#train[F_cols].fillna(0, inplace = True)
#test[F_cols].fillna(0, inplace = True)

In [12]:
train['OKVED_CODE'].unique()

array(['35.14', 'unknown', '36.00.2', '84.24', '35.12', '68.32.1',
       '35.30.14', '46.46.1', '35.30', '46.17', '80.10', '46.90', '06.20',
       '61.10', '35.11.1', '36.00', '61.10.1', '81.29.9', '37.00',
       '36.00.1', '46.3', '35.11', '35.30.3', '46.18.2', '41.20', '46.31',
       '38.22', '81.22', '10.71', '10.51', '56.10', '46.33', '37.0',
       '46.39', '47.19', '46.46', '65.12', '46.38', '52.24', '10.51.9',
       '38.1', '35.3', '49.41.2', '68.32', '35.13', '52.21.24', '10.13.1',
       '63.11.1', '01.13', '46.51', '46.62.2', '94.99', '47.73', '49.4',
       '49.3', '86', '46.18.11', '46.31.12', '33.12', '52.29', '46.32',
       '85.42', '86.90.1', '86.90.4', '86.10', '47.30', '43.29', '53.10',
       '18.13', '47.71', '96.01', '49.32', '43.2', '86.21', '62.01',
       '61.1', '46.46.2', '74.90', '46.32.1', '81.29.1', '58.13', '47.11',
       '93.29', '73.11', '46.42', '43.22', '46.39.1', '56.29', '43.21',
       '10.1', '35.23', '93.1', '35.30.1', '45.32', '46.49.33', '

## Dataset Transformations:

In [8]:
from feature_engine.creation import CyclicalTransformer
cyclical = CyclicalTransformer(variables = None, drop_original = True)

In [9]:
def transformations(df, train_flg = True):
    
    # времени с прошедших n заявок
    for i in range(-4, 5):
        
        if i != 0:
            df['SIGN_DATE_diff' + str(i)] = df.groupby('id_client')['SIGN_DATE'].diff(i).fillna(pd.Timedelta(0)).astype(int).reset_index(level = 0, drop = True)
    
    df['mean_time_diff'] = df.groupby('id_client')['SIGN_DATE_diff1'].transform('mean')
    df['app_num'] = df.groupby('id_client').cumcount()
    
    df['weekday'] = df['SIGN_DATE'].transform(lambda x : x.weekday()).astype(int)
    df['month_num'] = df['SIGN_DATE'].transform(lambda x : x.month).astype(int)
    
    df['taxreg_weekday'] = df['TAXREG_REGDATE'].transform(lambda x : x.weekday()).fillna(0)
    df['taxreg_month'] = df['TAXREG_REGDATE'].transform(lambda x : x.month).fillna(0)
    
    df['taxregpay_weekday'] = df['TAXREGPAY_REGDATE'].transform(lambda x : x.weekday()).fillna(0)
    df['taxregpay_month'] = df['TAXREGPAY_REGDATE'].transform(lambda x : x.month).fillna(0)
    
    df['datefirstreg_weekday'] = df['DATEFIRSTREG'].transform(lambda x : x.weekday()).fillna(0)
    df['datefirstreg_month'] = df['DATEFIRSTREG'].transform(lambda x : x.month).fillna(0)
    
    date_cols = ['weekday', 'month_num', 'taxreg_weekday', 'taxreg_month',
                 'taxregpay_weekday', 'taxregpay_month', 'datefirstreg_weekday', 'datefirstreg_month']
    
    sub_df = pd.DataFrame.copy(df[date_cols])
    
    if train_flg:
        
        res = cyclical.fit_transform(sub_df)
    
    else:
        
        res = cyclical.transform(sub_df)
        
    df.drop(columns = date_cols, inplace = True)
    
    for col in date_cols:
        
        df[col + "_sin"] = res[col + "_sin"]
    
    early_time = pd.to_datetime('2018-01-01').value
    
    for col in dt[:-1]:
        
        df[col + "_num"] = df[col].apply(lambda x : x.value - early_time)
        
    for col in dt:
        
        df[col].fillna(pd.to_datetime('2018-01-01'), inplace = True)
    
    return df

In [10]:
train = transformations(train, train_flg = True)
test = transformations(test, train_flg = False)

## CatBoost

In [11]:
del_cols = [] #['FLAG_DISQUALIFICATION', 'SUM_95_EVER', 'F1400', 'F1410']

In [12]:
feat_cols = [col for col in train.columns if col not in (['id_contract', 'TARGET'] + dt + del_cols)]

In [13]:
feat_cols.__len__()

139

In [14]:
train[feat_cols].dtypes

id_client                   int64
IP_flag                     int64
F1100                     float64
F1110                     float64
F1150                     float64
                           ...   
datefirstreg_month_sin    float64
SIGN_DATE_num               int64
DATEFIRSTREG_num           object
TAXREG_REGDATE_num         object
TAXREGPAY_REGDATE_num      object
Length: 139, dtype: object

In [15]:
model = CatBoostClassifier(cat_features=ct.to_list(), objective='Logloss')
#model = CatBoostClassifier(objective = 'Logloss')

In [16]:
model.fit(train[feat_cols], train['TARGET'])

Learning rate set to 0.035303
0:	learn: 0.6665038	total: 75.8ms	remaining: 1m 15s
1:	learn: 0.6378663	total: 105ms	remaining: 52.4s
2:	learn: 0.6133330	total: 137ms	remaining: 45.5s
3:	learn: 0.5904185	total: 170ms	remaining: 42.4s
4:	learn: 0.5693863	total: 203ms	remaining: 40.4s
5:	learn: 0.5494758	total: 229ms	remaining: 38s
6:	learn: 0.5264259	total: 263ms	remaining: 37.3s
7:	learn: 0.5108703	total: 295ms	remaining: 36.6s
8:	learn: 0.4975158	total: 326ms	remaining: 35.9s
9:	learn: 0.4823913	total: 356ms	remaining: 35.3s
10:	learn: 0.4668933	total: 389ms	remaining: 34.9s
11:	learn: 0.4555997	total: 418ms	remaining: 34.4s
12:	learn: 0.4473148	total: 451ms	remaining: 34.2s
13:	learn: 0.4369534	total: 476ms	remaining: 33.5s
14:	learn: 0.4277872	total: 505ms	remaining: 33.2s
15:	learn: 0.4211150	total: 533ms	remaining: 32.8s
16:	learn: 0.4116288	total: 565ms	remaining: 32.7s
17:	learn: 0.4037202	total: 592ms	remaining: 32.3s
18:	learn: 0.3960376	total: 629ms	remaining: 32.5s
19:	learn: 

164:	learn: 0.2308444	total: 5.46s	remaining: 27.7s
165:	learn: 0.2305712	total: 5.5s	remaining: 27.6s
166:	learn: 0.2303187	total: 5.54s	remaining: 27.6s
167:	learn: 0.2299925	total: 5.57s	remaining: 27.6s
168:	learn: 0.2298107	total: 5.59s	remaining: 27.5s
169:	learn: 0.2296588	total: 5.63s	remaining: 27.5s
170:	learn: 0.2292207	total: 5.66s	remaining: 27.4s
171:	learn: 0.2289000	total: 5.7s	remaining: 27.4s
172:	learn: 0.2287251	total: 5.73s	remaining: 27.4s
173:	learn: 0.2284023	total: 5.76s	remaining: 27.3s
174:	learn: 0.2282611	total: 5.8s	remaining: 27.3s
175:	learn: 0.2280940	total: 5.83s	remaining: 27.3s
176:	learn: 0.2278520	total: 5.86s	remaining: 27.3s
177:	learn: 0.2275691	total: 5.9s	remaining: 27.2s
178:	learn: 0.2273538	total: 5.94s	remaining: 27.2s
179:	learn: 0.2271518	total: 5.98s	remaining: 27.3s
180:	learn: 0.2270732	total: 6.02s	remaining: 27.3s
181:	learn: 0.2268706	total: 6.06s	remaining: 27.2s
182:	learn: 0.2266150	total: 6.11s	remaining: 27.3s
183:	learn: 0.22

323:	learn: 0.2037369	total: 11s	remaining: 22.9s
324:	learn: 0.2035533	total: 11s	remaining: 22.9s
325:	learn: 0.2034648	total: 11.1s	remaining: 22.9s
326:	learn: 0.2033779	total: 11.1s	remaining: 22.8s
327:	learn: 0.2032150	total: 11.1s	remaining: 22.8s
328:	learn: 0.2031525	total: 11.2s	remaining: 22.8s
329:	learn: 0.2030628	total: 11.2s	remaining: 22.8s
330:	learn: 0.2028499	total: 11.2s	remaining: 22.7s
331:	learn: 0.2026995	total: 11.3s	remaining: 22.7s
332:	learn: 0.2026339	total: 11.3s	remaining: 22.7s
333:	learn: 0.2025177	total: 11.3s	remaining: 22.6s
334:	learn: 0.2023882	total: 11.4s	remaining: 22.6s
335:	learn: 0.2022218	total: 11.4s	remaining: 22.6s
336:	learn: 0.2021238	total: 11.4s	remaining: 22.5s
337:	learn: 0.2020174	total: 11.5s	remaining: 22.5s
338:	learn: 0.2019659	total: 11.5s	remaining: 22.4s
339:	learn: 0.2017889	total: 11.5s	remaining: 22.4s
340:	learn: 0.2016472	total: 11.6s	remaining: 22.4s
341:	learn: 0.2015662	total: 11.6s	remaining: 22.4s
342:	learn: 0.20

487:	learn: 0.1840499	total: 16.6s	remaining: 17.4s
488:	learn: 0.1838809	total: 16.6s	remaining: 17.4s
489:	learn: 0.1837326	total: 16.7s	remaining: 17.4s
490:	learn: 0.1836408	total: 16.7s	remaining: 17.3s
491:	learn: 0.1835293	total: 16.7s	remaining: 17.3s
492:	learn: 0.1834081	total: 16.8s	remaining: 17.3s
493:	learn: 0.1832353	total: 16.8s	remaining: 17.2s
494:	learn: 0.1831575	total: 16.8s	remaining: 17.2s
495:	learn: 0.1830864	total: 16.9s	remaining: 17.2s
496:	learn: 0.1829620	total: 16.9s	remaining: 17.1s
497:	learn: 0.1828345	total: 17s	remaining: 17.1s
498:	learn: 0.1826872	total: 17s	remaining: 17.1s
499:	learn: 0.1826141	total: 17s	remaining: 17s
500:	learn: 0.1824976	total: 17s	remaining: 17s
501:	learn: 0.1824052	total: 17.1s	remaining: 16.9s
502:	learn: 0.1823723	total: 17.1s	remaining: 16.9s
503:	learn: 0.1823141	total: 17.1s	remaining: 16.9s
504:	learn: 0.1822295	total: 17.2s	remaining: 16.8s
505:	learn: 0.1821599	total: 17.2s	remaining: 16.8s
506:	learn: 0.1820334	to

651:	learn: 0.1685640	total: 22.5s	remaining: 12s
652:	learn: 0.1685151	total: 22.5s	remaining: 12s
653:	learn: 0.1683566	total: 22.6s	remaining: 11.9s
654:	learn: 0.1682630	total: 22.6s	remaining: 11.9s
655:	learn: 0.1681738	total: 22.6s	remaining: 11.9s
656:	learn: 0.1680388	total: 22.6s	remaining: 11.8s
657:	learn: 0.1679374	total: 22.7s	remaining: 11.8s
658:	learn: 0.1678177	total: 22.7s	remaining: 11.7s
659:	learn: 0.1676910	total: 22.7s	remaining: 11.7s
660:	learn: 0.1676398	total: 22.8s	remaining: 11.7s
661:	learn: 0.1675082	total: 22.8s	remaining: 11.6s
662:	learn: 0.1674457	total: 22.8s	remaining: 11.6s
663:	learn: 0.1673491	total: 22.9s	remaining: 11.6s
664:	learn: 0.1672607	total: 22.9s	remaining: 11.5s
665:	learn: 0.1671502	total: 22.9s	remaining: 11.5s
666:	learn: 0.1670266	total: 23s	remaining: 11.5s
667:	learn: 0.1669807	total: 23s	remaining: 11.4s
668:	learn: 0.1669038	total: 23s	remaining: 11.4s
669:	learn: 0.1668080	total: 23.1s	remaining: 11.4s
670:	learn: 0.1667004	

814:	learn: 0.1558321	total: 28.2s	remaining: 6.41s
815:	learn: 0.1557568	total: 28.3s	remaining: 6.38s
816:	learn: 0.1557107	total: 28.3s	remaining: 6.34s
817:	learn: 0.1556011	total: 28.4s	remaining: 6.31s
818:	learn: 0.1554781	total: 28.4s	remaining: 6.27s
819:	learn: 0.1553484	total: 28.4s	remaining: 6.24s
820:	learn: 0.1553057	total: 28.5s	remaining: 6.21s
821:	learn: 0.1552428	total: 28.5s	remaining: 6.17s
822:	learn: 0.1551793	total: 28.5s	remaining: 6.14s
823:	learn: 0.1551173	total: 28.6s	remaining: 6.1s
824:	learn: 0.1550627	total: 28.6s	remaining: 6.07s
825:	learn: 0.1550249	total: 28.6s	remaining: 6.03s
826:	learn: 0.1549248	total: 28.7s	remaining: 6s
827:	learn: 0.1548610	total: 28.7s	remaining: 5.96s
828:	learn: 0.1547541	total: 28.7s	remaining: 5.93s
829:	learn: 0.1546536	total: 28.8s	remaining: 5.89s
830:	learn: 0.1546047	total: 28.8s	remaining: 5.86s
831:	learn: 0.1545820	total: 28.8s	remaining: 5.82s
832:	learn: 0.1545222	total: 28.9s	remaining: 5.79s
833:	learn: 0.15

975:	learn: 0.1461646	total: 33.9s	remaining: 833ms
976:	learn: 0.1460375	total: 33.9s	remaining: 799ms
977:	learn: 0.1459946	total: 34s	remaining: 764ms
978:	learn: 0.1459319	total: 34s	remaining: 729ms
979:	learn: 0.1458259	total: 34s	remaining: 695ms
980:	learn: 0.1457567	total: 34.1s	remaining: 660ms
981:	learn: 0.1457337	total: 34.1s	remaining: 625ms
982:	learn: 0.1456690	total: 34.1s	remaining: 590ms
983:	learn: 0.1456293	total: 34.2s	remaining: 556ms
984:	learn: 0.1455932	total: 34.2s	remaining: 521ms
985:	learn: 0.1455572	total: 34.2s	remaining: 486ms
986:	learn: 0.1455440	total: 34.3s	remaining: 452ms
987:	learn: 0.1454467	total: 34.3s	remaining: 417ms
988:	learn: 0.1453326	total: 34.4s	remaining: 382ms
989:	learn: 0.1452583	total: 34.4s	remaining: 347ms
990:	learn: 0.1451954	total: 34.4s	remaining: 313ms
991:	learn: 0.1451067	total: 34.5s	remaining: 278ms
992:	learn: 0.1450859	total: 34.5s	remaining: 243ms
993:	learn: 0.1450040	total: 34.5s	remaining: 208ms
994:	learn: 0.1449

<catboost.core.CatBoostClassifier at 0x7fea970dc640>

In [17]:
test['TARGET'] = model.predict_proba(test[feat_cols])[:, 1]

In [18]:
test[['id_contract', 'TARGET']].to_csv('submission.csv', sep=';', index=False)

## Нелегально, но дает около 0.002 буста на паблике

In [19]:
exemp = pd.DataFrame.copy(test)

In [20]:
exemp[['id_client', 'TARGET']]

Unnamed: 0,id_client,TARGET
0,3620,0.009926
1,4101,0.030266
2,9589,0.055482
3,11546,0.460837
4,12558,0.905740
...,...,...
7325,8128,0.431984
7326,1132,0.002827
7327,4932,0.216726
7328,537,0.339689


In [21]:
train = pd.read_csv("Datasets/train_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')

In [22]:
intersect = set(exemp['id_client']) & set(train['id_client'])

In [23]:
active_ids = (train.groupby('id_client')['id_contract'].count() > 5).keys()[(train.groupby('id_client')['id_contract'].count() > 5).values]

In [24]:
len(set(active_ids) & intersect)

145

In [25]:
their_values = train.groupby('id_client')['TARGET'].mean()[(train.groupby('id_client')['id_contract'].count() > 5)].values

In [26]:
their_values[(their_values > 0.95)] = 1
their_values[(their_values < 0.05)] = 0

In [27]:
kept_ones = (their_values == 1) | (their_values == 0)

In [28]:
active_ids = active_ids[kept_ones]
their_values = their_values[kept_ones]

In [29]:
active_ids.shape, their_values.shape

((280,), (280,))

In [30]:
sub_df = pd.DataFrame(np.array([active_ids, their_values]).T, columns = ['id_client', 'TARGET_new'])

In [31]:
exemp = exemp.merge(sub_df, how = 'left')

In [32]:
exemp.loc[~exemp['TARGET_new'].isna(), 'TARGET'] = exemp.loc[~exemp['TARGET_new'].isna(), 'TARGET_new']

In [33]:
exemp[['id_contract', 'TARGET']].to_csv("submission_v2.csv", sep = ';', index = False)