In [245]:
import numpy as np, pandas as pd
from collections import Counter
import datetime

In [246]:
# загрузка данных

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
labels = np.load('./labels.npy')

In [247]:
# возращаем формат datetime для дат
for df in [train, test]:
    df['date'] = df['date'].apply(lambda x: datetime.date(int(x[:4]), int(x[5:7]), int(x[8:])))
    df['time'] = df['time'].apply(lambda x: datetime.time(int(x[:2]), int(x[3:5]), int(x[6:])))

# самая первая дата в данных
oldest_date = min(set((train['date'].unique())).union(test['date'].unique()))

# все различные trx типы транзакций
trx_categories = list(set(train['trx_category'].unique()).union(test['trx_category']))

# знаки операций
trx_signs = {
    'POS': -1, 'DEPOSIT': 1, 'C2C_OUT': -1, 'C2C_IN': 1, 'WD_ATM_PARTNER': -1, 'WD_ATM_ROS': -1, 
    'BACK_TRX': 1, 'WD_ATM_OTHER': -1, 'CASH_ADV': -1, 'CAT': -1
}

In [248]:
# метод cashflow представляет юзера в виде удобного для анализа словаря
from rosbank import cashflow

In [249]:
# отдельная группа дат обрабатывается следующим образом
def get_stats(dates):
    if len(dates) == 0:
        return [0] * 11
    else:
        features = []

        time = [(date - oldest_date).days/360 for date in dates]
        features += [time[-1] - time[0], np.std(time), np.median(time), np.mean(time), np.max(time)]

        if len(dates) > 1:
            diff = [(time[i+1] - time[i]) for i in np.arange(len(dates)-1)]
            features += [np.mean(diff), np.max(diff), np.min(diff), np.std(diff)]
            features += list(np.percentile(diff, [50, 95]))
        else:
            features += [0] * 6
    
        return features

In [250]:
# данная функция применяет get_stats к нескольким группам дат (по типу транзакций, по типу знака транзакции, 
# и всё вместе)
def analyze(user):
    history = cashflow(user)
    
    trx_dates = dict((cat, []) for cat in trx_categories)
    signed_dates = {1: [], -1: []}
    
    for date in history:
        for trx_category in history[date]:
            trx_dates[trx_category].append(date)
            signed_dates[trx_signs[trx_category]].append(date)
          
    features = get_stats(list(history))
    for curr_dates in [list(history)] + [trx_dates[cat] 
                                         for cat in trx_categories] + [signed_dates[1], signed_dates[-1]]:
        features += get_stats(curr_dates)
        if len(curr_dates) == 0.:
            features += [0] * 20
        
        else:
            last_calendar_month = np.max(curr_dates).month
            features += get_stats([date for date in curr_dates if date.month == last_calendar_month])
            
            months = list(Counter([day.month for day in curr_dates]).values())
            features += [np.mean(months), np.median(months), np.max(months), np.min(months), np.std(months)]
            
            # категориальный признак, показывающий в какую четверть года попадает последний календарный месяц
            if last_calendar_month in [1, 2, 3]:
                features += [1, 0, 0, 0]
            elif last_calendar_month in [4, 5, 6]:
                features += [0, 1, 0, 0]
            elif last_calendar_month in [7, 8, 9]:
                features += [0, 0, 1, 0]
            else:
                features += [0, 0, 0, 1]
    return features

In [256]:
# непосредственное применение функций
data = pd.DataFrame([el for el in train.groupby('cl_id', sort = False).apply(analyze).values])

In [253]:
# кросс-валидация

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss

kfold = StratifiedKFold(n_splits = 10, random_state = 2, shuffle = True)
splits = [el for el in kfold.split(np.zeros(len(labels)), labels)]

In [254]:
def get_score(data):
    scores = []
    for i in np.arange(10):
        tr, te = splits[i]
        train_X, train_y = data.values[tr], labels[tr]
        test_X, test_y = data.values[te], labels[te]
        gbm = LGBMClassifier(n_estimators = 50000, learning_rate = 0.1, subsample = 0.5,
                             colsample_bytree=0.8)
        gbm.fit(train_X, train_y, eval_set = (test_X, test_y), early_stopping_rounds = 300, 
                verbose = False)
        pred = gbm.predict_proba(test_X)[:, 1]
        scores.append(roc_auc_score(y_score = pred, y_true = test_y))
        print(i + 1, "iteration:", scores[-1])
    print("CV score:", np.mean(scores))

In [257]:
# для улучшения качества следует поиграться с параметрами (типа learning rate)
# также некоторые признаки из get_stats точно можно отбросить + напридумывать других делений на даты
# например, будни и выходые
get_score(data)

1 iteration: 0.8419127127385251
2 iteration: 0.8508412841670963
3 iteration: 0.8825103145951521
4 iteration: 0.8563403538877467
5 iteration: 0.8614722118793607
6 iteration: 0.8353515403668388
7 iteration: 0.833093199073999
8 iteration: 0.8804997725352571
9 iteration: 0.8342919347501139
10 iteration: 0.8413595892636639
CV score: 0.8517672913257753
