In [88]:
import os 
import sys
import time
import datetime
import dateutils
import collections
import itertools
from tqdm.notebook import tqdm, trange

import numpy as np
import pandas as pd
import scipy as sci
from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [89]:
matplotlib.rcParams['figure.figsize'] = (10, 7)
sns.set_style('whitegrid')

In [90]:
train_part = pd.read_csv('alfabattle2_abattle_train_target.csv', parse_dates=['timestamp'])
train_part.drop(columns=['session_id'], inplace=True)
print(train_part.shape)
train_part.head(3)

(5065350, 3)


Unnamed: 0,client_pin,timestamp,multi_class_target
0,7cf9221322a0e2fdefb1b998b8f2ab29,2020-06-15 14:01:12,main_screen
1,5f16c0ab27a806fd08db3122921adf3a,2020-03-21 12:59:34,invest
2,ec868fc2b388293cf10e18ee9518d72f,2020-01-24 18:18:55,statement


In [91]:
test_part = pd.read_csv('alfabattle2_prediction_session_timestamp.csv', parse_dates=['timestamp'])
test_part['multi_class_target'] = 'need_to_predict'
print(test_part.shape)
test_part.head(3)

(79268, 3)


Unnamed: 0,client_pin,timestamp,multi_class_target
0,f0c674b2bb4dc64be607029271d706ec,2020-08-01 00:00:53,need_to_predict
1,90725b54ce77576883813d87749df6bd,2020-08-01 00:02:57,need_to_predict
2,eb0f82d74c7b7bd5eafbd5b5f8cb3e2a,2020-08-01 00:03:14,need_to_predict


In [92]:
sample_sub = pd.read_csv('alfabattle2_abattle_sample_prediction.csv')
print(sample_sub.shape)
sample_sub.head(3)

(79268, 2)


Unnamed: 0,client_pin,prediction
0,f0c674b2bb4dc64be607029271d706ec,credit_info
1,90725b54ce77576883813d87749df6bd,credit_info
2,eb0f82d74c7b7bd5eafbd5b5f8cb3e2a,own_transfer


### EDA

In [93]:
train_part.client_pin.value_counts().describe(percentiles=[.05, .10, .15])

count    79268.000000
mean        63.901574
std         85.280433
min          1.000000
5%           3.000000
10%          5.000000
15%          8.000000
50%         35.000000
max       1646.000000
Name: client_pin, dtype: float64

In [94]:
train_part.multi_class_target.value_counts()

main_screen             2280763
statement                922569
credit_info              498698
own_transfer             290077
mobile_recharge          266485
phone_money_transfer     232911
card2card_transfer       193378
chat                     184775
card_recharge            138616
invest                    57078
Name: multi_class_target, dtype: int64

### Concatanation

In [95]:
train_part.multi_class_target.unique()

array(['main_screen', 'invest', 'statement', 'phone_money_transfer',
       'own_transfer', 'credit_info', 'chat', 'card2card_transfer',
       'mobile_recharge', 'card_recharge'], dtype=object)

In [96]:
Xy = pd.concat([train_part, test_part], axis=0).sort_values(['client_pin', 'timestamp'], ascending=[True, True])
Xy.reset_index(inplace=True, drop=True)
print(Xy.shape)
Xy.head(3)

(5144618, 3)


Unnamed: 0,client_pin,timestamp,multi_class_target
0,000033b6509acd1c8eb0d06ebd2e1de9,2020-02-02 22:31:04,statement
1,000033b6509acd1c8eb0d06ebd2e1de9,2020-02-19 18:37:26,main_screen
2,000033b6509acd1c8eb0d06ebd2e1de9,2020-02-22 23:18:45,main_screen


In [97]:
Xy.iloc[1, 1] < Xy.iloc[2, 1] # сравнение работает!

True

### Feature engineering

In [98]:
for k in trange(1, 9, 1):
    Xy.loc[:, f'target_lag_{k}'] = Xy.groupby(['client_pin'])['multi_class_target'].shift(k)
    Xy.loc[:, f'time_diff_{k}'] = Xy.groupby(['client_pin'])['timestamp'].diff(k).astype('timedelta64[s]')

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [99]:
%%time
Xy_cleaned = Xy.groupby('client_pin').apply(lambda group: group.iloc[8:, 1:] \
                                            if group.shape[0] >= 9 else group.iloc[:, 1:]
                                           ).reset_index().drop(columns=['level_1'])
print(Xy_cleaned.shape)
Xy_cleaned.head(3)

(4598578, 19)
CPU times: user 3min 50s, sys: 8.81 s, total: 3min 59s
Wall time: 4min 2s


Unnamed: 0,client_pin,timestamp,multi_class_target,target_lag_1,time_diff_1,target_lag_2,time_diff_2,target_lag_3,time_diff_3,target_lag_4,time_diff_4,target_lag_5,time_diff_5,target_lag_6,time_diff_6,target_lag_7,time_diff_7,target_lag_8,time_diff_8
0,000033b6509acd1c8eb0d06ebd2e1de9,2020-03-10 18:17:27,main_screen,main_screen,110837.0,main_screen,262795.0,main_screen,376670.0,main_screen,502662.0,main_screen,1039650.0,main_screen,1450722.0,main_screen,1726801.0,statement,3181583.0
1,000033b6509acd1c8eb0d06ebd2e1de9,2020-03-12 15:07:50,main_screen,main_screen,161423.0,main_screen,272260.0,main_screen,424218.0,main_screen,538093.0,main_screen,664085.0,main_screen,1201073.0,main_screen,1612145.0,main_screen,1888224.0
2,000033b6509acd1c8eb0d06ebd2e1de9,2020-03-12 17:19:50,main_screen,main_screen,7920.0,main_screen,169343.0,main_screen,280180.0,main_screen,432138.0,main_screen,546013.0,main_screen,672005.0,main_screen,1208993.0,main_screen,1620065.0


In [123]:
%%time
Xy_cleaned_train = Xy_cleaned.groupby('client_pin').apply(lambda group: group.iloc[-5:-1, 1:]).reset_index()
Xy_cleaned_test = Xy_cleaned.groupby('client_pin').apply(lambda group: group.iloc[-1, 1:]).reset_index()
Xy_cleaned_train.shape, Xy_cleaned_test.shape

CPU times: user 3min 59s, sys: 8.22 s, total: 4min 7s
Wall time: 4min 11s


((293217, 20), (79268, 19))

In [124]:
X_train, y_train = Xy_cleaned_train.drop(columns=['multi_class_target']), Xy_cleaned_train['multi_class_target']
X_test, y_test = Xy_cleaned_test.drop(columns=['multi_class_target']), Xy_cleaned_test['multi_class_target']

In [125]:
X_train.drop(columns=['timestamp', 'level_1'], inplace=True)
X_test.drop(columns=['timestamp'], inplace=True)

In [126]:
X_train.columns

Index(['client_pin', 'target_lag_1', 'time_diff_1', 'target_lag_2',
       'time_diff_2', 'target_lag_3', 'time_diff_3', 'target_lag_4',
       'time_diff_4', 'target_lag_5', 'time_diff_5', 'target_lag_6',
       'time_diff_6', 'target_lag_7', 'time_diff_7', 'target_lag_8',
       'time_diff_8'],
      dtype='object')

In [127]:
cat_features = ['client_pin'] + [f'target_lag_{k}' for k in range(1, 9)]
num_features = [f'time_diff_{k}' for k in range(1, 9)]

In [128]:
X_train[cat_features] = X_train[cat_features].fillna(value='nan_category')
X_test[cat_features] = X_test[cat_features].fillna(value='nan_category')

X_train[num_features] = X_train[num_features].fillna(value=25*10**6)
X_test[num_features] = X_test[num_features].fillna(value=25*10**6)

### Models

In [129]:
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier

In [132]:
model = CatBoostClassifier(allow_writing_files=False, verbose=5, cat_features=cat_features, 
                           n_estimators=105, eta=0.20)

model.fit(X_train, y_train)

0:	learn: 1.8826427	total: 4.14s	remaining: 7m 10s
5:	learn: 1.5130006	total: 26.4s	remaining: 7m 16s
10:	learn: 1.4338440	total: 51.3s	remaining: 7m 18s
15:	learn: 1.3946854	total: 1m 18s	remaining: 7m 15s
20:	learn: 1.3712596	total: 1m 45s	remaining: 7m
25:	learn: 1.3589457	total: 2m 12s	remaining: 6m 41s
30:	learn: 1.3518874	total: 2m 39s	remaining: 6m 19s
35:	learn: 1.3455812	total: 3m 5s	remaining: 5m 56s
40:	learn: 1.3407339	total: 3m 32s	remaining: 5m 32s
45:	learn: 1.3372817	total: 4m	remaining: 5m 7s
50:	learn: 1.3336234	total: 4m 26s	remaining: 4m 42s
55:	learn: 1.3305333	total: 4m 55s	remaining: 4m 18s
60:	learn: 1.3277326	total: 5m 28s	remaining: 3m 56s
65:	learn: 1.3253017	total: 5m 59s	remaining: 3m 32s
70:	learn: 1.3233199	total: 6m 27s	remaining: 3m 5s
75:	learn: 1.3207331	total: 6m 54s	remaining: 2m 38s
80:	learn: 1.3187215	total: 7m 22s	remaining: 2m 11s
85:	learn: 1.3170610	total: 7m 49s	remaining: 1m 43s
90:	learn: 1.3152348	total: 8m 16s	remaining: 1m 16s
95:	learn

<catboost.core.CatBoostClassifier at 0x7feeb02896d0>

In [328]:
# # сохраняем модель
# import joblib
# joblib.dump(
#     value=model,
#     filename='./tmp/catboost_model.pkl'
# );

In [133]:
print('F1 на обучении:', 
        round(f1_score(
            y_true=y_train,
            y_pred=model.predict(X_train),
            average='macro'
        ), 3)
     )

F1 на обучении: 0.427


In [138]:
sample_sub['client_pin'] = X_test.client_pin.values
sample_sub['prediction'] = model.predict(X_test)

In [139]:
sample_sub.to_csv('alfaboosters_1.csv', index=False)