In [475]:
import os 
import sys
import time
import datetime
import itertools
import collections

import numpy as np
import pandas as pd
import scipy as sc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

* __timestamp__ - дата и время совершения события

* __application_id__ - идентификатор приложения

* __client__ - Идентификатор клиента 

* __session_id__ - Идентификатор сессии

* __event_type__ - Тип события

* __event_category__ - Категория события

* __event_name__ - Имя события

* __event_label__ - Дополнительный атрибут события
 
* __device_screen_name__ - Имя экрана, на котором произошло событие

* __timezone__ - Часовой пояс

* __device_is_webview__ - Флаг того, что страница открыта внутри webview
 
* __page_urlhost__ - Домен страницы

* __page_urlpath_full__ - Путь страницы

* __net_connection_type__ - Тип подключения

* __net_connection_tech__ - Технология подключения

In [476]:
# Load the training targets and put them into a stable order
# (client, then session, then event time) with a fresh 0..n-1 index.
train_part = (
    pd.read_csv('alfabattle2_abattle_train_target.csv', parse_dates=['timestamp'])
    .sort_values(by=['client_pin', 'session_id', 'timestamp'])
    .reset_index(drop=True)
)
print(train_part.shape)
train_part.head(3)

(5065350, 4)


Unnamed: 0,session_id,client_pin,timestamp,multi_class_target
0,01e2bfc05dda08dd9ea3e881e45858cc,000033b6509acd1c8eb0d06ebd2e1de9,2020-09-04 16:23:23,main_screen
1,046e7a872bc29e8fd38fab3cd7bb3636,000033b6509acd1c8eb0d06ebd2e1de9,2020-07-27 19:07:38,main_screen
2,0512ce7ff813662409ca40acbd1d16df,000033b6509acd1c8eb0d06ebd2e1de9,2020-02-22 23:18:45,main_screen


In [477]:
# Load the prediction file; `timestamp` is parsed to datetime on read.
test_part = pd.read_csv(
    'alfabattle2_prediction_session_timestamp.csv',
    parse_dates=['timestamp'],
)
print(test_part.shape)
test_part.head(3)

(79268, 2)


Unnamed: 0,client_pin,timestamp
0,f0c674b2bb4dc64be607029271d706ec,2020-08-01 00:00:53
1,90725b54ce77576883813d87749df6bd,2020-08-01 00:02:57
2,eb0f82d74c7b7bd5eafbd5b5f8cb3e2a,2020-08-01 00:03:14


In [478]:
# Load the sample submission to see the expected output layout.
sample_sub = pd.read_csv(
    'alfabattle2_abattle_sample_prediction.csv',
)
print(sample_sub.shape)
sample_sub.head(3)

(79268, 2)


Unnamed: 0,client_pin,prediction
0,f0c674b2bb4dc64be607029271d706ec,credit_info
1,90725b54ce77576883813d87749df6bd,credit_info
2,eb0f82d74c7b7bd5eafbd5b5f8cb3e2a,own_transfer


## Feature engineering with click dataset

In [479]:
# Whitelists used by the click feature loop below.

# First component of the `timezone` string ("<zone>/<city>") that is kept
# as-is; any other value is replaced with 'Etc'.
zone = [
    'Europe', 'Asia', 'Africa', 'Australia',
    'Pacific', 'Antarctica', 'America', 'Etc',
]

# Second component of the `timezone` string that is kept as-is; any other
# value is replaced with 'Other'.
city = [
    'Moscow', 'Yekaterinburg', 'Krasnoyarsk', 'Samara', 'Minsk',
    'Novosibirsk', 'Omsk', 'Vladivostok', 'Irkutsk', 'Kaliningrad',
    'Ho_Chi_Minh', 'Volgograd', 'Novokuznetsk', 'Karachi', 'Dubai',
    'Yakutsk', 'Bangkok', 'Kolkata', 'Shanghai', 'Baku',
    'Magadan', 'Almaty', 'Saratov', 'Kiev', 'Yerevan',
    'Addis_Ababa', 'Tbilisi', 'Jakarta', 'Kamchatka',
]

# `event_category` values that get their own share column; all other
# categories are blanked out (set to NaN) before the shares are computed.
cats = [
    'Application Lifecycle', 'SingleStatement', 'Bottom Navigation', 'Main Screen',
    'Widget Dashboard', 'Card To Card Transfer', 'Push', 'ResultScreen',
    'All Payments', 'Bank_Offer', 'Chat', 'Investments',
    'Recharge', 'Credit Info', 'ClientPhoneTransfer', 'Operations',
    'user', 'ReverseCashBack', 'mobile', 'Advice',
]

In [480]:
def _session_shares(frame, column):
    """Return the per-session value shares of ``column`` as a wide table.

    The result has one row per ``session_id`` and one column per distinct
    non-null value of ``column``. Each cell is the fraction of the session's
    non-null ``column`` entries that carry that value (0 when the value never
    occurs in the session). Sessions whose ``column`` is entirely null do not
    appear. ``session_id`` is returned as a regular column so the table can be
    merged on it.

    This replaces ``groupby.apply(value_counts(normalize=True))`` followed by
    ``pivot(columns="level_1")``: that form runs a Python lambda per session
    and relies on the auto-generated ``level_1`` column name, which pandas 2.0
    no longer produces for ``value_counts`` results.
    """
    shares = pd.crosstab(frame['session_id'], frame[column], normalize='index')
    shares.columns.name = None
    return shares.reset_index()


# Build one session-level feature table per click-stream part and store it as
# part_<i>.pkl. Only sessions that have a training target are kept.

# Loop-invariant: the set of labelled sessions does not change between parts.
train_sessions = train_part.session_id.unique()

# (page host, name of the boolean flag column derived from it), in the order
# in which the flag columns are appended to `click`.
host_flags = (
    ('online.alfabank.ru', 'online_alfa_ru'),
    ('click.alfabank.ru', 'click_alfa_ru'),
    ('anketa.alfabank.ru', 'anketa_alfa_ru'),
    ('alfabank.ru', 'alfa_ru'),
)

for i in range(0, 10, 1):
    print(f'{i}-set, time: {(datetime.datetime.now() + datetime.timedelta(hours=3)).time()}')

    click = pd.read_parquet(f'part-0000{i}.parquet')
    click = click.loc[click.session_id.isin(train_sessions)]
    click = click.drop(columns=['event_label', 'page_urlpath_full', 'device_screen_name',
                                'event_name', 'net_connection_tech'])
    click = click.sort_values(['client', 'session_id', 'timestamp']).reset_index(drop=True)
    print(click.shape)

    # --- location: split "<zone>/<city>" and keep the session's last value ---
    locations = click.timezone.str.split(pat="/", n=1, expand=True)
    locations.loc[~locations[0].isin(zone), 0] = 'Etc'
    locations.loc[~locations[1].isin(city), 1] = 'Other'
    # `click` has a fresh RangeIndex, so the column-wise concat aligns row by row.
    locations = pd.concat([click.loc[:, ['session_id']], locations], axis=1)
    locations = (locations.drop_duplicates(subset='session_id', keep='last')
                          .reset_index(drop=True)
                          .rename(columns={0: 'zone', 1: 'city'}))
    click = click.drop(columns='timezone')

    # --- application: collapse ids into coarse groups, then per-session shares ---
    click.loc[click.application_id.str.find('retail') != -1, 'application_id'] = 'retail'
    click.loc[click.application_id.isin(['ncl', 'passport_metrics']), 'application_id'] = 'mobile'
    applications = _session_shares(click, 'application_id')
    click = click.drop(columns='application_id')

    # --- duration: last minus first timestamp of the session, in milliseconds ---
    # Rows are already sorted by timestamp, so first/last are positional.
    # Dividing by a 1 ms Timedelta gives float milliseconds on every pandas
    # version (astype('timedelta64[ms]') changed its return dtype in pandas 2.0).
    session_stamps = click.groupby('session_id')['timestamp']
    durations = (session_stamps.last() - session_stamps.first()).rename('duration').reset_index()
    durations['duration'] = durations['duration'] / pd.Timedelta(milliseconds=1)
    click = click.drop(columns='timestamp')

    # --- event type shares; the 'err' share is discarded ---
    # errors='ignore' keeps a part without any 'err' event from raising KeyError.
    event_types = _session_shares(click, 'event_type').drop(columns='err', errors='ignore')
    click = click.drop(columns='event_type')

    # --- event category shares, restricted to the whitelisted categories ---
    click['event_category'] = click['event_category'].where(click['event_category'].isin(cats))
    event_categories = _session_shares(click, 'event_category')
    click = click.drop(columns='event_category')

    # --- page host flags: True on rows served from the given host ---
    for host, flag in host_flags:
        click[flag] = click.page_urlhost == host
    click = click.drop(columns='page_urlhost')

    # --- connection type: missing means offline; keep the session's last value ---
    # Only this column is filled. Filling the whole frame with 'offline' (as the
    # previous version did) also turned missing `device_is_webview` values into
    # a truthy string, so those sessions were reported as webview sessions.
    click['net_connection_type'] = click['net_connection_type'].fillna('offline')
    net_types = click.loc[:, ['session_id', 'net_connection_type']].drop_duplicates(subset='session_id',
                                                                                     keep='last')
    click = click.drop(columns='net_connection_type')

    # --- boolean flags: True if any event of the session has the flag set ---
    # A missing webview flag counts as "not a webview". The result is assigned
    # back explicitly because a chained in-place fillna may not write through.
    click['device_is_webview'] = click['device_is_webview'].fillna(False).astype(bool)
    # Same columns as the label slice 'device_is_webview':'alfa_ru'.
    first_flag = click.columns.get_loc('device_is_webview')
    last_flag = click.columns.get_loc('alfa_ru')
    flag_columns = list(click.columns[first_flag:last_flag + 1])
    boolings = click.groupby('session_id')[flag_columns].any().reset_index()

    # --- assemble: outer-join every feature table on session_id ---
    part = locations
    for table in (applications, durations, event_types, event_categories, net_types, boolings):
        part = part.merge(table, on='session_id', how='outer')
    print(part.shape)
    print('--------------------------')
    print()

    part.to_pickle(f'part_{i}.pkl')

0-set, time: 01:58:58.166314
(8644341, 10)
(526700, 38)
--------------------------

1-set, time: 02:27:49.320097
(8430545, 10)
(516736, 38)
--------------------------

2-set, time: 02:56:22.373258
(7982539, 10)
(482861, 38)
--------------------------

3-set, time: 03:23:07.165477
(8485579, 10)
(516789, 38)
--------------------------

4-set, time: 03:51:45.742894
(8305089, 10)
(505136, 38)
--------------------------

5-set, time: 04:19:44.797431
(8176490, 10)
(499672, 38)
--------------------------

6-set, time: 04:47:44.941994
(8344834, 10)
(500210, 38)
--------------------------

7-set, time: 05:15:37.402025
(8571231, 10)
(519316, 38)
--------------------------

8-set, time: 05:44:33.248389
(8104973, 10)
(499397, 38)
--------------------------

9-set, time: 06:12:21.336276
(8147199, 10)
(498533, 38)
--------------------------



In [481]:
# Stack the ten per-part session feature tables written by the loop above.
parts = pd.concat([pd.read_pickle(f'part_{i}.pkl') for i in range(0, 10, 1)], axis=0)

# Attach the session features to the training targets.
# validate='many_to_one' makes the merge raise if a session_id occurs in more
# than one row of `parts` (e.g. a session split across two part files), which
# would otherwise silently duplicate training rows.
# NOTE: `train_part` is overwritten, so re-running this cell without reloading
# `train_part` merges the features a second time.
train_part = train_part.merge(parts, on='session_id', how='left', validate='many_to_one')
train_part.to_pickle('big_train.pkl')