# Базовое решение для задачи B

In [None]:
import pandas as pd
import numpy as np
import xgboost

# Additive offset used in the log transform of the regression target:
# the model is trained on log(-amount + SHIFT), and SHIFT is subtracted
# again after exponentiating the predictions.
SHIFT = 500

Читаем входные файлы с данными

In [None]:
# Load the raw transaction log; the relative path is resolved against the
# current working directory.
raw_transactions_path = '../data/raw/transactions.csv'
transactions = pd.read_csv(raw_transactions_path)

Берём расходные транзакции и формируем тестовую выборку

In [None]:
# Keep only rows with a negative amount and work on an independent copy so
# that new columns can be added without touching `transactions`.
is_expense = transactions.amount < 0
train_transactions = transactions[is_expense].copy()

# The first whitespace-separated token of tr_datetime is used as the
# integer day number.
train_transactions['day'] = train_transactions.tr_datetime.str.split().str[0].astype(int)

mcc_codes = train_transactions.mcc_code.unique()

# Test grid: every mcc_code crossed with the 30 days that follow the last
# training day (mcc_code varies slowest, day fastest).
forecast_days = np.arange(1, 31) + train_transactions.day.max()
test_transactions = pd.MultiIndex.from_product(
    [mcc_codes, forecast_days], names=['mcc_code', 'day']
).to_frame(index=False)

# Train grid: every mcc_code crossed with every day seen in training, so
# that (mcc_code, day) pairs without transactions can be filled in later.
observed_days = train_transactions.day.unique()
train_grid = pd.MultiIndex.from_product(
    [mcc_codes, observed_days], names=['mcc_code', 'day']
).to_frame(index=False)

Добавляем признаки, относящиеся к дате.

In [None]:
# Derive calendar-like features from the integer day number. Weeks are
# fixed 7-day buckets and "months" are fixed 30-day buckets.
for frame in (train_transactions, test_transactions, train_grid):
    day_number = frame['day']
    frame['week_num'] = day_number // 7
    frame['week_day'] = day_number % 7
    frame['month_num'] = day_number // 30
    frame['month_day'] = day_number % 30

# Total amount per (day, mcc_code); the date features are carried through
# the group key so they survive the aggregation.
group_keys = ['day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code']
daily_totals = train_transactions.groupby(group_keys)[['amount']].sum().reset_index()

# Left-join onto the full grid (on all shared columns); grid cells with no
# transactions get amount 0.
train_transactions = pd.merge(train_grid, daily_totals, how='left').fillna(0)

Для каждого mcc_code получим данные о сумме транзакций за каждый день. Используем для обучения только временные признаки.

In [None]:
# Build lagged-amount features: for every (day_shift, month_shift) pair the
# log-amount observed `month_shift` 30-day buckets earlier, at the in-bucket
# position offset by `day_shift`, is joined onto both train and test rows.
join_keys = ['month_num', 'month_day', 'mcc_code']

for day_shift in (-1, 0, 1):
    # The first unique month_num value (in order of appearance) is skipped.
    for month_shift in train_transactions.month_num.unique()[1:]:
        feature_name = 'amount_day_{}_{}'.format(day_shift, month_shift)

        # Re-key a copy of the history so that it lines up with the rows
        # it should act as a feature for.
        lagged = train_transactions[join_keys + ['amount']].copy()
        lagged['month_num'] = lagged['month_num'] + month_shift
        lagged['month_day'] = lagged['month_day'] + day_shift
        lagged[feature_name] = np.log(-lagged['amount'] + 1)
        lagged = lagged[join_keys + [feature_name]]

        # Rows without a matching lagged observation get 0.
        train_transactions = pd.merge(
            train_transactions, lagged, on=join_keys, how='left'
        ).fillna(0)
        test_transactions = pd.merge(
            test_transactions, lagged, on=join_keys, how='left'
        ).fillna(0)

Обучаемся на всех данных

In [None]:
# One-hot encode mcc_code in both frames the same way.
train, test = (
    pd.get_dummies(frame, columns=['mcc_code'])
    for frame in (train_transactions, test_transactions)
)

# Feature columns: everything in the training frame except the target.
c = train.columns.difference(['amount'])

In [None]:
# Regression target: log of the negated amount, offset by SHIFT.
log_target = np.log(-train['amount'] + SHIFT)

train_features = train[c]
test_features = test[c]

dtrain = xgboost.DMatrix(train_features, label=log_target)
dtest = xgboost.DMatrix(test_features)

In [None]:
# `k` trades learning rate against number of rounds: eta is divided by k
# while the number of boosting rounds is multiplied by k.
k = 500
param = {
    'eta': 0.2 / float(k),
    'max_depth': 5,
    'colsample_bytree': 0.2,
    'min_child_weight': 13,
    'gamma': 14,
    'subsample': 0.7,
    # 'reg:linear' is a deprecated alias in XGBoost; 'reg:squarederror' is
    # the current name for the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# Fit on the full training matrix (no validation split in this cell).
clf = xgboost.train(param, dtrain, num_boost_round=100 * k)

Делаем submit (в довесок сохраняем текущий блокнот)

In [None]:
import time
import datetime
import inspect
from shutil import copyfile

# Invert the target transform used for training (log(-amount + SHIFT)).
# A prediction below log(SHIFT) would map to a negative volume, so the
# result is clipped at zero.
predicted_volume = np.exp(clf.predict(dtest)) - SHIFT
test_transactions['volume'] = np.clip(predicted_volume, 0, None)

# Timestamp (local time) used to give the submission files a unique name.
ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

# Write the submission and keep a copy of the notebook next to it.
submit_prefix = '../data/submits/sbm_' + ts
test_transactions[['mcc_code', 'day', 'volume']].to_csv(submit_prefix + '.csv', index=False)
copyfile('task2.ipynb', submit_prefix + '.notebook.ipynb')