In [2]:
import numpy as np, pandas as pd
from collections import Counter
import datetime
from rosbank import cashflow

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
labels = np.load('./labels.npy')
for df in [train, test]:
    df['date'] = df['date'].apply(lambda x: datetime.date(int(x[:4]), int(x[5:7]), int(x[8:])))
    df['time'] = df['time'].apply(lambda x: datetime.time(int(x[:2]), int(x[3:5]), int(x[6:])))

In [3]:
trx_categories = list(set(train['trx_category'].unique()))
category_signs = {
    'POS': -1, 'DEPOSIT': 1, 'C2C_OUT': -1, 'C2C_IN': 1, 'WD_ATM_PARTNER': -1, 'WD_ATM_ROS': -1, 
    'BACK_TRX': 1, 'WD_ATM_OTHER': -1, 'CASH_ADV': -1, 'CAT': -1
}
oldest_date = np.min(train['date'].values)

In [5]:
mcc_descr = np.load("./mcc_codes.npy").item()
mcc_groups = list(set([mcc_descr[el]['group'] for el in mcc_descr]))

hc_groups = np.load("./handcrafted_mcc_groups.npy").item()
hc_descr = {}
for group in hc_groups:
    for el in hc_groups[group]:
        hc_descr[el] = group
for el in [el for el in mcc_descr if not el in hc_descr]:
    hc_descr[el] = 'other'
    
hc_groups = list(set([hc_descr[el] for el in hc_descr]))

In [6]:
def analyze_dates(dates):
    features = []
    if len(dates) == 0.:
        features += [0] * 4
    else:
        features += [(np.max(dates) - np.min(dates)).days/365]
        
        time = [(date - oldest_date).days/360 for date in dates]
        features += [np.min(time), np.max(time), np.std(time)]
    if len(dates) > 1.:
        diff = [(dates[i+1] - dates[i]).days/365 for i in np.arange(len(dates)-1)]
        features += [np.max(diff), np.std(diff)]
    else:
        features += [0] * 2
    dates = sorted(list(set(dates)))
    if len(dates) > 1:
        diff = [(dates[i+1] - dates[i]).days/365 for i in np.arange(len(dates)-1)]
        features += [np.mean(diff), np.median(diff)]
    else:
        features += [0] * 2
    return features

In [7]:
def analyze_history(user):
    if user.shape[0] == 0.:
        return [0] * ((len(hc_groups) + len(trx_categories) + 1) * 8)
    history = cashflow(user)
    features = []
    
    ########
    hc_dates = {}
    trx_dates = {}
    dates = []
    for name in hc_groups:
        hc_dates[name] = []
    for name in trx_categories:
        trx_dates[name] = []
    ########  
    for date in history:
        for trx_category in history[date]:
            trx_sign = category_signs[trx_category]
            for mcc_category in history[date][trx_category]:
                hc_dates[hc_descr[mcc_category]] += [date]
                dates += [date]
                trx_dates[trx_category] += [date]
                #mcc_dates[mcc_descr[mcc_category]['group']] += [date]
    ########
    for name in hc_groups:
        features += analyze_dates(hc_dates[name])
    for name in trx_categories:
        features += analyze_dates(trx_dates[name])
    #for name in mcc_groups:
    #    features += analyze_dates(mcc_dates[name])
    for arr in [dates]:
        features += analyze_dates(dates)
    return features

In [8]:
def analyze(user):
    features = []
    features += analyze_history(user)
    last_date = np.max(user['date'].values)
    for n in [31]:
        features += analyze_history(user[user['date'].apply(lambda x: (last_date - x).days <= n)])
        
    dates = sorted(user[user['trx_category'] == 'POS']['date'].values)
    months = [date.month for date in dates]
    diff_months = []
    for month in months:
        if not month in diff_months:
            diff_months.append(month)
    
    features += [len(diff_months)]

    if len(months) > 0:
        if diff_months[0] in [1, 2, 3]:
            features += [1, 0, 0, 0]
        elif diff_months[0] in [4, 5, 6]:
            features += [0, 1, 0, 0]
        elif diff_months[0] in [7, 8, 9]:
            features += [0, 0, 1, 0]
        else:
            features += [0, 0, 0, 1]
    else:
        features += [0] * 4
    return features

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import RepeatedStratifiedKFold
skf = RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=201805)
import lightgbm as lgb

In [9]:
data = pd.DataFrame(data = [el for el in train.groupby('cl_id', sort = False).apply(analyze)])
data = (data - np.mean(data, axis = 0))/np.std(data, axis = 0)
lgb_data_train = lgb.Dataset(data, labels, free_raw_data=False)
params = {
    'objective':'binary', 'metric': 'auc', 'learning_rate': 0.01,  'random_state':4242442,
    'subsample':0.33,
    'class_weight':'balanced',
    'colsample_bytree':0.33,
    #'reg_lambda':4
}



h = lgb.cv(params, lgb_data_train,  num_boost_round=10000, 
           early_stopping_rounds=50, verbose_eval=50, folds=skf.split(data, labels))

print(list(h.values())[0][-1])

[50]	cv_agg's auc: 0.847157 + 0.0115189
[100]	cv_agg's auc: 0.849555 + 0.0113101
[150]	cv_agg's auc: 0.850872 + 0.0110204
[200]	cv_agg's auc: 0.852441 + 0.0108217
[250]	cv_agg's auc: 0.853596 + 0.0109287
[300]	cv_agg's auc: 0.854486 + 0.0108069
[350]	cv_agg's auc: 0.855023 + 0.0107556
[400]	cv_agg's auc: 0.855316 + 0.0110262
[450]	cv_agg's auc: 0.855778 + 0.0110777
[500]	cv_agg's auc: 0.855776 + 0.0107906
0.8558782010184699
