In [1]:
import numpy as np, pandas as pd
from collections import Counter
from tqdm import tqdm_notebook
import datetime

from rosbank import cashflow

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
labels = np.load('./labels.npy')
for df in [train, test]:
    df['date'] = df['date'].apply(lambda x: datetime.date(int(x[:4]), int(x[5:7]), int(x[8:])))
    df['time'] = df['time'].apply(lambda x: datetime.time(int(x[:2]), int(x[3:5]), int(x[6:])))

In [3]:
trx_categories = list(set(train['trx_category'].unique()))

In [4]:
mcc_descr = np.load("./mcc_codes.npy").item()
mcc_groups = list(set([mcc_descr[el]['group'] for el in mcc_descr]))

hc_groups = np.load("./handcrafted_mcc_groups.npy").item()
hc_descr = {}
for group in hc_groups:
    for el in hc_groups[group]:
        hc_descr[el] = group
for el in [el for el in mcc_descr if not el in hc_descr]:
    hc_descr[el] = 'other'
    
hc_groups = list(set([hc_descr[el] for el in hc_descr]))

In [5]:
category_signs = {
    'POS': -1, 'DEPOSIT': 1, 'C2C_OUT': -1, 'C2C_IN': 1, 'WD_ATM_PARTNER': -1, 'WD_ATM_ROS': -1, 
    'BACK_TRX': 1, 'WD_ATM_OTHER': -1, 'CASH_ADV': -1, 'CAT': -1
}

In [6]:
def monotone(array):
    if len(array) <= 1:
        return 1.
    else:
        return np.mean([array[i+1] > array[i] for i in np.arange(len(array)-1)])
    
def analyze_srt_amounts(amounts):
    features = []
    if len(amounts) == 0:
        return [0] * 11
    log_amounts = [np.log(el) for el in amounts]
    features += [monotone(amounts)]
    
    if len(amounts) > 1:
        diff = [log_amounts[i+1] - log_amounts[i] for i in np.arange(len(amounts) - 1)]
        features += [np.max(diff), np.mean(diff)]
        features += list(np.percentile(diff, [5, 25, 50, 75, 80, 95]))
        features += [monotone(diff)]
    else:
        features += [0] * 9
        
    for i in np.arange(1, len(amounts) + 1):
        s1 = np.sum(amounts[:i])
        s2 = np.sum(amounts[i:])
        if s1 >= s2:
            features += [i/len(amounts)]
            break
    return features

def analyze_amounts(amounts):
    features = []
    n = len(amounts)
    if n == 0.:
        features += [0] * 7
    else:
        features += [n, np.sum(amounts), np.mean(amounts), np.max(amounts), np.median(amounts),
               np.std(amounts), np.min(amounts)]
    return features

In [7]:
def analyze_cash(user):
    features = []
    history = cashflow(user)
    ###########
    signed_amounts = []
    pos_amounts = []
    hc_amounts = {}
    for name in hc_groups:
        hc_amounts[name] = []
    trx_amounts = {}
    for name in trx_categories:
        trx_amounts[name] = []
    ###########  
    for date in history:
        for trx_category in history[date]:
            trx_sign = category_signs[trx_category]
            trx_value = 0
            for mcc_category in history[date][trx_category]:
                hc_group = hc_descr.get(mcc_category, 0.0)
                mcc_value = np.sum(history[date][trx_category][mcc_category])
                trx_value += mcc_value
                if hc_group != 0.:
                    hc_amounts[hc_group] += [mcc_value]
            trx_amounts[trx_category] += [trx_value]
            signed_amounts += [trx_sign * trx_value]
            if trx_sign == 1.:
                pos_amounts += [trx_value]
    ##########
    for name in hc_groups:       
        features += analyze_amounts(hc_amounts[name])
    for name in trx_categories:       
        features += analyze_amounts(trx_amounts[name])
    for arr in [signed_amounts, pos_amounts]:
        features += analyze_amounts(arr)
        
    account = np.array([sum(signed_amounts[:i]) for i in np.arange(1, len(signed_amounts) + 1)])
    if account[-1] < 0:
        features += [0]
    else:
        features += [1]
    account = 1. + account - np.min(account)
    features += analyze_srt_amounts(account)
    return features

In [8]:
def analyze(user):
    features = []
    features += analyze_cash(user)
    
    for currency in ['RUB', 'USD', 'EUR']:
        features += analyze_amounts(sorted(user[user['currency'] == currency]['amount'].values))
      
    last_date = np.max(user['date'])
    for n in [7, 31]:
        features += analyze_cash(user[user['date'].apply(lambda x: (last_date - x).days <= n)])
    first_date = np.min(user['date'])
    for n in [31]:
        features += analyze_cash(user[user['date'].apply(lambda x: (x - first_date).days <= n)])
    return features
    

In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold
skf = RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=201805)
import lightgbm as lgb

In [11]:
data = pd.DataFrame(data = [el for el in train.groupby('cl_id', sort = False).apply(analyze)])
data = (data - np.mean(data, axis = 0))/np.std(data, axis = 0)
lgb_data_train = lgb.Dataset(data, labels, free_raw_data=False)
params = {
    'objective':'binary', 'metric': 'auc', 'learning_rate': 0.01,  'random_state':4242442,
    'subsample':0.33,
    'class_weight':'balanced',
    'colsample_bytree':0.33,
    'reg_lambda':4
}



h = lgb.cv(params, lgb_data_train,  num_boost_round=10000, 
           early_stopping_rounds=50, verbose_eval=50, folds=skf.split(data, labels))

print(list(h.values())[0][-1])

[50]	cv_agg's auc: 0.814598 + 0.00900034
[100]	cv_agg's auc: 0.81854 + 0.00889604
[150]	cv_agg's auc: 0.822116 + 0.00893029
[200]	cv_agg's auc: 0.82646 + 0.00911823
[250]	cv_agg's auc: 0.829447 + 0.0092842
[300]	cv_agg's auc: 0.831587 + 0.00941526
[350]	cv_agg's auc: 0.833269 + 0.00941938
[400]	cv_agg's auc: 0.834685 + 0.00958308
[450]	cv_agg's auc: 0.835517 + 0.00965495
[500]	cv_agg's auc: 0.836278 + 0.00943598
[550]	cv_agg's auc: 0.836853 + 0.00945099
[600]	cv_agg's auc: 0.837247 + 0.00931908
[650]	cv_agg's auc: 0.837651 + 0.00935247
[700]	cv_agg's auc: 0.838016 + 0.0093938
[750]	cv_agg's auc: 0.838297 + 0.00940402
[800]	cv_agg's auc: 0.83837 + 0.00928502
[850]	cv_agg's auc: 0.838434 + 0.00919136
[900]	cv_agg's auc: 0.838606 + 0.00913803
[950]	cv_agg's auc: 0.838685 + 0.00932019
[1000]	cv_agg's auc: 0.838748 + 0.00921612
[1050]	cv_agg's auc: 0.838856 + 0.00925641
0.8388973150627456
