In [1]:
import pandas as pd
import numpy as np
import catboost
import re
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import joblib

from tqdm._tqdm_notebook import tqdm_notebook

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
# Считываем данные
tr_mcc_codes = pd.read_csv('mcc_codes.csv', sep=';', index_col='mcc_code')
tr_types = pd.read_csv('trans_types.csv', sep=';', index_col='trans_type')

transactions = pd.read_csv('transactions.csv', index_col='client_id')
gender_train = pd.read_csv('train.csv', index_col='client_id')
gender_test = pd.read_csv('test.csv', index_col='client_id')
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')
transactions_train = transactions_train.join(tr_mcc_codes, how='inner', on = 'mcc_code')
transactions_test = transactions_test.join(tr_mcc_codes, how='inner', on = 'mcc_code')

In [4]:
transactions_train.head()

Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,Unnamed: 0,gender,mcc_description
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0002cf30347684df542e1a931f356875,313 14:52:03,4829,2370,-2170.07,888990,Saint Petersburg,6806,0,Денежные переводы
0002cf30347684df542e1a931f356875,440 22:02:27,4829,2330,-2893.2,800926,Saint Petersburg,6806,0,Денежные переводы
0002cf30347684df542e1a931f356875,363 07:44:21,4829,2331,-5011.43,10298153,Saint Petersburg,6806,0,Денежные переводы
0002cf30347684df542e1a931f356875,379 11:37:57,4829,2331,-20761.51,980004,Saint Petersburg,6806,0,Денежные переводы
0002cf30347684df542e1a931f356875,376 18:16:56,4829,2330,-26032.85,800925,Saint Petersburg,6806,0,Денежные переводы


In [3]:
datasets = [transactions_train, transactions_test]

In [4]:
for dataset in datasets:
    dataset['earned'] = dataset['amount'] > 0
    dataset['day'] = int(dataset['trans_time'].str.split()[0][0])
    dataset['hour'] = int(dataset['trans_time'].str.split()[1][0].split(':')[0][0])

In [12]:
def group_by_id(x):
    features = []
    positive = x.loc[x['amount']>0]
    negative = x.loc[x['amount']<0]
    # print(positive.groupby([positive.index, positive['day']])['amount'].agg(['sum', 'std', 'count']).shape)
    features.append(x.groupby([x.index, x['day']])['amount'].agg(['sum', 'std', 'count']).add_prefix('tr_per_day_'))
    features.append(positive.groupby([positive.index, positive['day']])['amount'].agg(['sum', 'std', 'count']).add_prefix('tr_per_day_pos_'))
    features.append(negative.groupby([negative.index, negative['day']])['amount'].agg(['sum', 'std', 'count']).add_prefix('tr_per_day_neg_'))
    features.append(positive.groupby(positive.index)['amount'].agg(['sum', 'count'])\
                                                        .add_prefix('positive_transactions_'))
    features.append(negative.groupby(negative.index)['amount'].agg(['sum', 'count'])\
                                                        .add_prefix('negative_transactions_'))
    features.append(x.groupby(x.index)['amount'].agg(['sum', 'mean', 'std', 'count'])\
                                                        .add_prefix('all_transactions_'))                                                    
    features.append(x.groupby([x.index, 'mcc_code'])['amount'].agg(['sum', 'mean', 'std', 'count'])\
                                                        .add_prefix('mcc_transactions_'))
    features.append(x.groupby([x.index, 'trans_type'])['amount'].agg(['sum', 'mean', 'std', 'count'])\
                                                        .add_prefix('type_transactions_'))
    features.append(negative.groupby(negative.index)['amount'].agg('sum') / positive.groupby(positive.index)['amount'].agg('sum'))
    features.append(pd.get_dummies(data = x,
                                    columns = ['trans_type', 'mcc_code', 'trans_city'], drop_first=True))
    res = pd.concat(features)                                          
    return res.fillna(0, inplace = True)

In [13]:
for dataset in datasets:
    dataset = group_by_id(dataset)

In [16]:
complete_train = group_by_id(transactions_train)

In [17]:
complete_train.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [93]:
complete_test = group_by_id(transactions_test)

(772, 3)
                                    trans_time    amount   term_id  \
client_id                                                            
002218c52073faaf23bb26df2cffb138   54 10:35:16   -360.78         0   
002218c52073faaf23bb26df2cffb138   52 11:00:46  -1446.74         0   
002218c52073faaf23bb26df2cffb138   85 08:23:11  -1446.73         0   
002218c52073faaf23bb26df2cffb138  344 08:59:29    724.15  10117859   
002218c52073faaf23bb26df2cffb138  392 13:41:42    722.73  10117859   
...                                        ...       ...       ...   
e89f8505edcc1828967b118a4a128497  375 23:59:46   -739.43  28560136   
e89f8505edcc1828967b118a4a128497  158 00:00:32 -86230.51         0   
e89f8505edcc1828967b118a4a128497  375 00:00:59  -2546.02  PA03CC06   
e89f8505edcc1828967b118a4a128497  370 23:59:25  -8358.02  PA03CC02   
f11d7a67d566a5a94d8643023a4dc60e  339 11:29:31  -1481.92  500021KP   

                                  Unnamed: 0  \
client_id                       

In [91]:
complete_test.head()

Unnamed: 0,tr_per_day_sum,tr_per_day_std,tr_per_day_count,tr_per_day_pos_sum,tr_per_day_pos_std,tr_per_day_pos_count,tr_per_day_neg_sum,tr_per_day_neg_std,tr_per_day_neg_count,positive_transactions_sum,...,mcc_code_9402,trans_city_Kazan,trans_city_Khabarovsk,trans_city_Moscow,trans_city_Novosibirsk,trans_city_Penza,trans_city_Saint Petersburg,trans_city_Tver,trans_city_Vladimir,trans_city_Vladivostok
"(002218c52073faaf23bb26df2cffb138, 54)",-76968.86,1872.053451,124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(0069d4050c7aeb341317e2e125e8fd2e, 54)",-170516.24,2211.220425,267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(0083fb89aea18c408a5534cdd38d7692, 54)",-98003.78,9213.845832,355.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(014021fa559786c81e9e1d48d378aabb, 54)",-327453.37,2789.158555,553.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(01759f92152bbb3de79f9493a4a5cd6d, 54)",-51049.0,2602.015856,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
