In [1]:
import numpy as np, pandas as pd
from collections import Counter
import scipy.stats as sts
import datetime

In [4]:
# загрузка данных

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
labels = np.load('./labels.npy')

In [5]:
# возращаем формат datetime для дат
for df in [train, test]:
    df['date'] = df['date'].apply(lambda x: datetime.date(int(x[:4]), int(x[5:7]), int(x[8:])))
    df['time'] = df['time'].apply(lambda x: datetime.time(int(x[:2]), int(x[3:5]), int(x[6:])))

# самая первая дата в данных
oldest_date = min(set((train['date'].unique())).union(test['date'].unique()))

# все различные trx типы транзакций
trx_categories = list(set(train['trx_category'].unique()).union(test['trx_category']))

# знаки операций
trx_signs = {
    'POS': -1, 'DEPOSIT': 1, 'C2C_OUT': -1, 'C2C_IN': 1, 'WD_ATM_PARTNER': -1, 'WD_ATM_ROS': -1, 
    'BACK_TRX': 1, 'WD_ATM_OTHER': -1, 'CASH_ADV': -1, 'CAT': -1
}

In [6]:
# метод cashflow представляет юзера в виде удобного для анализа словаря
from rosbank import cashflow

In [36]:


# проверка монотонности
def monotone(array):
    if len(array) <= 1:
        return 1.
    else:
        return np.mean([array[i+1] > array[i] for i in np.arange(len(array)-1)])
    
# анализ массива трат как неупорядоченного множества
def analyze_amounts(amounts):
    if len(amounts) == 0.:
        return [0] * 14
    else:
        features = []
        log_amounts = [np.log(amount) for amount in amounts]
        features += [np.min(log_amounts), np.max(log_amounts), np.max(log_amounts) - np.min(log_amounts), 
                     np.log(np.sum(amounts)), np.sum(log_amounts), np.std(log_amounts), np.std(amounts), 
                     sts.skew(amounts)]
        features += list(np.percentile(amounts, [5, 25, 50, 75, 80, 95]))
        return features

# анализ массива трат с учетом порядка операций
def analyze_srt_amounts(amounts):
    features = []
    if len(amounts) == 0:
        return [0] * 13
    log_amounts = [np.log(el) for el in amounts]
    features += [log_amounts[0], log_amounts[-1], monotone(amounts)]
    
    if len(amounts) > 1:
        diff = [log_amounts[i+1] - log_amounts[i] for i in np.arange(len(amounts) - 1)]
        features += [np.max(diff), np.mean(diff)]
        features += list(np.percentile(diff, [5, 25, 50, 75, 80, 95]))
        features += [monotone(diff)]
    else:
        features += [0] * 9
        
    for i in np.arange(1, len(amounts) + 1):
        s1 = np.sum(amounts[:i])
        s2 = np.sum(amounts[i:])
        if s1 >= s2:
            features += [i/len(amounts)]
            break
    return features

In [37]:
# анализирует траты для различных групп (аналогично времени)
def get_amounts(history):

    
    trx_amounts = dict((cat, []) for cat in trx_categories)
    amounts = []
    signed_amounts = []

    pos_amounts, neg_amounts = [], []
        
    for date in history:
        for trx_category in history[date]:
            # количество денег, фигурирующее за день по тому или иному типу транзакции
            amount = np.sum([np.sum(history[date][trx_category][mcc]) for mcc in history[date][trx_category]])
            amounts += [amount]
            trx_amounts[trx_category] += [amount]
            signed_amounts += [trx_signs[trx_category] * amount]
                      
            if trx_signs[trx_category] == 1:
                pos_amounts += [amount]
            else:
                neg_amounts += [amount]
            
    features = []   
    
    for arr in [amounts, pos_amounts, neg_amounts] + [trx_amounts[cat] for cat in trx_categories]:
        features += analyze_amounts(arr)
        features += analyze_srt_amounts(arr)
    
    account = np.array([sum(signed_amounts[:i]) for i in np.arange(1, len(signed_amounts) + 1)])
    account = 1 + account - np.min(account)
    features += analyze_srt_amounts(account)
    return features


def analyze(user):
    features = []
    
    history = cashflow(user)
    # денежные характеристики по всей истории
    features += get_amounts(history)
        
    # денежные характеристики за последний месяц
    last_calendar_month = np.max(user['date'].values).month
    last_month_history = cashflow(user[user['date'].apply(lambda x: x.month == last_calendar_month)])
    features += get_amounts(last_month_history)
    
    #for currency in ['RUB', 'USD', 'EUR']:
       # features += analyze_amounts(sorted(user[user['currency'] == currency]['amount'].values))
    return features

In [38]:
# непосредственное применение функций
data = pd.DataFrame([el for el in train.groupby('cl_id', sort = False).apply(analyze).values])

In [39]:
data -= np.mean(data, axis = 0)
data /= np.std(data, axis = 0)

In [40]:
# кросс-валидация

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss

kfold = StratifiedKFold(n_splits = 10, random_state = 2, shuffle = True)
splits = [el for el in kfold.split(np.zeros(len(labels)), labels)]

In [41]:
def get_score(data):
    scores = []
    for i in np.arange(10):
        tr, te = splits[i]
        train_X, train_y = data.values[tr], labels[tr]
        test_X, test_y = data.values[te], labels[te]
        gbm = LGBMClassifier(n_estimators = 15000, learning_rate = 0.1, subsample = 0.5,
                             colsample_bytree=0.8)
        gbm.fit(train_X, train_y, eval_set = (test_X, test_y), early_stopping_rounds = 300, 
                verbose = False)
        pred = gbm.predict_proba(test_X)[:, 1]
        scores.append(roc_auc_score(y_score = pred, y_true = test_y))
        #print(i + 1, "iteration:", scores[-1])
    print("CV score:", np.mean(scores))

In [42]:
# для улучшения качества следует поиграться с параметрами (типа learning rate)
get_score(data)

CV score: 0.8303170613715395
