In [1]:
import pandas as pd
import numpy as np

import pickle
from collections import Counter

# import matplotlib.pyplot as plt
# import seaborn as sns
# import datetime
from catboost import CatBoostClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
PATH_DATA = './data'

# Читаем данные транзакций
transactions = pd.read_csv(os.path.join(PATH_DATA, 'transactions.csv'), header=0, index_col='client_id')
gender_train = pd.read_csv(os.path.join(PATH_DATA, 'train.csv'), header=0, index_col='client_id')
# gender_test = pd.read_csv(os.path.join(PATH_DATA, 'test.csv'), header=0, index_col='client_id')

# Парсинг времени
# transactions['trans_time'] = (
#     datetime.datetime(2020, 3, 8, 0, 0, 0)
#     - datetime.timedelta(219)
#     + pd.to_timedelta(transactions['trans_time'].str.replace(' ', ' days ', n=1))
# )

# Разбивка на train test
transactions_train = transactions.join(gender_train, how='inner').drop('Unnamed: 0', axis=1)
# transactions_test = transactions.join(gender_test, how='inner').drop('Unnamed: 0', axis=1)

tr_mcc_codes = pd.read_csv(os.path.join(PATH_DATA, 'mcc_codes.csv'), sep=";")
tr_types = pd.read_csv(os.path.join(PATH_DATA, 'trans_types.csv'), sep=";")

transactions_train = transactions_train.reset_index().merge(tr_mcc_codes, how='left', on='mcc_code')
transactions_train = transactions_train.merge(tr_types, how='left', on='trans_type').set_index('client_id')
mcc_prob = pd.read_csv('Another solve/inference/mcc_proba.csv')


In [3]:
transactions_train

Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender,mcc_description,trans_description
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0002cf30347684df542e1a931f356875,313 14:52:03,4829,2370,-2170.07,888990,Saint Petersburg,0,Денежные переводы,Списание с карты на карту по операции <перевод...
0002cf30347684df542e1a931f356875,150 14:10:49,6011,2010,-1445.64,,Saint Petersburg,0,Финансовые институты — снятие наличности автом...,Выдача наличных в АТМ Сбербанк России
0002cf30347684df542e1a931f356875,122 12:38:32,5912,1010,-107.07,,Saint Petersburg,0,Аптеки,Покупка. POS ТУ СБ РФ
0002cf30347684df542e1a931f356875,159 13:22:34,6011,2010,-2892.86,,Saint Petersburg,0,Финансовые институты — снятие наличности автом...,Выдача наличных в АТМ Сбербанк России
0002cf30347684df542e1a931f356875,257 12:06:54,5912,1010,-164.49,469965,Saint Petersburg,0,Аптеки,Покупка. POS ТУ СБ РФ
...,...,...,...,...,...,...,...,...,...
fffedf876a0ea3d39e54b706165a4826,28 12:51:47,6011,2010,-10846.43,,Saint Petersburg,1,Финансовые институты — снятие наличности автом...,Выдача наличных в АТМ Сбербанк России
fffedf876a0ea3d39e54b706165a4826,55 07:37:08,6011,2010,-2170.15,,Saint Petersburg,1,Финансовые институты — снятие наличности автом...,Выдача наличных в АТМ Сбербанк России
fffedf876a0ea3d39e54b706165a4826,55 07:40:10,4814,1030,-71.41,,Saint Petersburg,1,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ
fffedf876a0ea3d39e54b706165a4826,188 15:05:52,6011,2010,-2169.75,,Saint Petersburg,1,Финансовые институты — снятие наличности автом...,Выдача наличных в АТМ Сбербанк России


# Разбивка на train и test

In [4]:
dataset = transactions_train.copy()

In [5]:
def is_weekend(day):
    if day == 151:
        return 1
    elif day % 7 in [0, 6, 5]:
        return 1
    return 0
  

dataset["day"] = dataset["trans_time"].apply(lambda x: x.split()[0]).astype(int)
dataset["hours"] = dataset["trans_time"].apply(lambda x: x.split()[1].split(':')[0]).astype(int)
dataset["minute"] = dataset["trans_time"].apply(lambda x: x.split()[1].split(':')[1]).astype(int)
dataset["seconds"] = dataset["trans_time"].apply(lambda x: x.split()[1].split(':')[2]).astype(int)


dataset['is_dinner'] = dataset['hours'].apply(lambda x: 1 if 11 <= x <= 17 else 0)
dataset['is_evening'] = dataset['hours'].apply(lambda x: 1 if 23 <= x <= 24 else 0)
dataset['weekend'] = dataset['day'].apply(is_weekend) # is weekend transaction
dataset["day"] = dataset["day"] % 365 # day of the year


dataset["term_id"] = dataset['term_id'].fillna("-1")

dataset["mcc_code"] = dataset["mcc_code"].astype("category")
dataset["trans_type"] = dataset["trans_type"].astype("category")
dataset["term_id"] = dataset["term_id"].astype("str")
dataset["trans_city"] = dataset["trans_city"].astype("category")
dataset["trans_description"] = dataset["trans_description"].astype("category")
dataset["mcc_description"] = dataset["mcc_description"].astype("category")
dataset["day"] = dataset["day"].astype("category")
dataset["hours"] = dataset["hours"].astype("uint8")
dataset["minute"] = dataset["minute"].astype("uint8")
dataset["seconds"] = dataset["seconds"].astype("uint8")



def get_client_mcc_features(dataset, mcc_prob):
    # get probability that mcc_code is a male mcc
    # calculate most freq mcc 
    most_freq_mcc = dataset.groupby(['client_id'])['mcc_code'].apply(lambda x: x.mode().iloc[0]).reset_index().rename(columns={'mcc_code': 'most_freq_mcc_code'})


    # calculate weighted mcc sum
    client_mcc = dataset.groupby(['client_id', 'mcc_code']).agg({'mcc_description': 'count'}).reset_index().rename(columns={'mcc_description': 'mcc_amount'})
    client_total_tr = client_mcc.groupby(['client_id'])['mcc_amount'].sum().reset_index().rename(columns={'mcc_amount': 'mcc_total'})
    client_mcc = client_mcc.merge(client_total_tr, how='left', on='client_id')
    client_mcc['mcc_share'] = client_mcc['mcc_amount'] / client_mcc['mcc_total']
    client_mcc = client_mcc.merge(mcc_prob, how='left', on='mcc_code')
    client_mcc['1_prob'].fillna(0.5, inplace=True)
    client_mcc['weighted'] = client_mcc['mcc_share'] * client_mcc['1_prob'] 
    result = client_mcc.groupby(['client_id'])['weighted'].sum().reset_index().rename(columns={'weighted': 'weighted_mcc'})
    result = result.merge(most_freq_mcc, how='left', on='client_id')
  
    return result



dataset = dataset.merge(get_client_mcc_features(dataset, mcc_prob), on='client_id')


dataset = dataset.groupby('client_id').agg(
    neg_amount_count=('amount', lambda x: (x < 0).sum()),
    pos_amount_count=('amount', lambda x: (x > 0).sum()),
    pos_amount_sum=('amount', lambda x: x[x > 0].sum()),
    neg_amount_sum=('amount', lambda x: x[x < 0].sum()),
    amount_mean=('amount', 'mean'),
    amount_std=('amount', 'std'),
    amount_max=('amount', 'max'),
    amount_min=('amount', 'min'),
    amount_median=('amount', 'median'),
    minutes_mean=('minute', 'mean'),
    seconds_mean=('seconds', 'mean'),
    weighted_mcc=('weighted_mcc', 'mean'),
    tr_count=('client_id', 'count'),
    tr_weekend_count=('weekend', 'sum'),
    tr_dinner_count=('is_dinner', 'sum'),
    tr_evening_count=('is_evening', 'sum'),
    term_id_most_frequent=('term_id', lambda x: Counter(x).most_common()[0][0]),
    trans_city_most_frequent=('trans_city', lambda x: Counter(x).most_common()[0][0]),
)



# X = dataset.drop(columns=["term_id_most_frequent", 'trans_city_most_frequent'])

# y_proba = model.predict_proba(X)[:, 1]

# dataset_sample = pd.read_csv('./data/dataset_sample_submission.csv')
# dataset_sample["probability"] = y_proba

# dataset_sample.reset_index().drop(['Unnamed: 0', 'index'], axis=1).to_csv('result.csv')

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7560 entries, 0002cf30347684df542e1a931f356875 to fffedf876a0ea3d39e54b706165a4826
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   neg_amount_count          7560 non-null   int64  
 1   pos_amount_count          7560 non-null   int64  
 2   pos_amount_sum            7560 non-null   float64
 3   neg_amount_sum            7560 non-null   float64
 4   amount_mean               7560 non-null   float64
 5   amount_std                7554 non-null   float64
 6   amount_max                7560 non-null   float64
 7   amount_min                7560 non-null   float64
 8   amount_median             7560 non-null   float64
 9   minutes_mean              7560 non-null   float64
 10  seconds_mean              7560 non-null   float64
 11  weighted_mcc              7560 non-null   float64
 12  tr_count                  7560 non-null   int64  
 13  tr_weeken

In [7]:
dataset = dataset.join(gender_train, how='inner').drop('Unnamed: 0', axis=1)

In [9]:
dataset.describe(include='all')

Unnamed: 0,neg_amount_count,pos_amount_count,pos_amount_sum,neg_amount_sum,amount_mean,amount_std,amount_max,amount_min,amount_median,minutes_mean,seconds_mean,weighted_mcc,tr_count,tr_weekend_count,tr_dinner_count,tr_evening_count,term_id_most_frequent,trans_city_most_frequent,gender
count,7560.0,7560.0,7560.0,7560.0,7560.0,7554.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560,7560.0
unique,,,,,,,,,,,,,,,,,619.0,10,
top,,,,,,,,,,,,,,,,,-1.0,Moscow,
freq,,,,,,,,,,,,,,,,,6893.0,790,
mean,350.318915,78.083333,437668.3,-729429.5,-1059.394447,6061.914837,42767.02,-41434.83,-602.677562,29.410773,29.476888,0.451838,428.403704,198.381349,187.39709,29.474339,,,0.444577
std,308.778761,1040.766794,3333598.0,3530969.0,2736.233299,12866.871379,131267.6,108578.1,2965.296423,2.013158,1.562672,0.014548,1112.83852,554.55987,496.976988,51.044312,,,0.496952
min,0.0,0.0,0.0,-262661500.0,-68819.905882,21.109072,-16631.52,-4338732.0,-144623.505,1.0,8.0,0.374493,1.0,0.0,0.0,0.0,,,0.0
25%,133.75,7.0,21164.9,-603866.3,-1156.90135,1927.962836,5061.208,-36358.83,-391.0025,28.5,28.813905,0.444472,152.0,71.0,65.0,1.0,,,0.0
50%,275.0,25.0,93810.96,-303341.1,-520.201299,3405.479564,15547.01,-21694.14,-217.74,29.425913,29.497111,0.447363,311.0,143.0,134.0,10.0,,,0.0
75%,480.0,55.0,288260.6,-157854.1,-140.396328,6368.238756,39772.39,-11276.62,-132.47125,30.2995,30.157418,0.456099,538.0,246.0,234.0,37.0,,,1.0


In [10]:
razbivka = dataset.groupby(['gender', 'client_id']).count().reset_index().set_index('client_id').sample(n=2268, random_state=63)
train = dataset[~dataset.index.isin(razbivka.index)].reset_index().set_index('client_id')
test = dataset[dataset.index.isin(razbivka.index)].reset_index().set_index('client_id')

In [11]:
model = CatBoostClassifier(
    iterations=20000,
    random_seed=63,
    learning_rate=0.001,
    custom_loss='AUC',
    verbose=False
)
model.fit(
    train.drop('gender', axis=1), train.gender,
    cat_features=['term_id_most_frequent', 'trans_city_most_frequent'],
    eval_set=(test.drop('gender', axis=1), test.gender),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x17cccca90>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os

In [8]:
PATH_DATA = './data'

# Читаем данные транзакций
transactions = pd.read_csv(os.path.join(PATH_DATA, 'transactions.csv'),
                           header=0,
                           index_col=False)
gender_train = pd.read_csv(os.path.join(PATH_DATA, 'train.csv'),
                           header=0,
                           index_col=0)
gender_test = pd.read_csv(os.path.join(PATH_DATA, 'test.csv'),
                          header=0,
                          index_col=0)

# Читаем данные полов
transactions_train = transactions.merge(gender_train, how='inner', on='client_id')
transactions_test = transactions.merge(gender_test, how='inner', on='client_id')

In [9]:
razbivka = transactions_train.groupby(['gender', 'client_id'], as_index=False).count()

razbivka2 = pd.concat([razbivka.query('gender == 1').sample(n=756)['client_id'], razbivka.query('gender == 0').sample(n=756)['client_id']], axis=0)
razbivka2
train = transactions_train[~transactions_train['client_id'].isin(razbivka2)]
test = transactions_train[transactions_train['client_id'].isin(razbivka2)]

print(train.head())
print(test.head())


                          client_id    trans_time  mcc_code  trans_type  \
0  d1bbbc9a0e0410d3cf12a3d2f44f3450   35 08:24:41      4829        2370   
1  d1bbbc9a0e0410d3cf12a3d2f44f3450  105 12:57:32      4829        2370   
2  d1bbbc9a0e0410d3cf12a3d2f44f3450  455 19:32:01      4814        1030   
3  d1bbbc9a0e0410d3cf12a3d2f44f3450   83 09:22:26      6011        2010   
4  d1bbbc9a0e0410d3cf12a3d2f44f3450   74 13:31:57      6011        2010   

    amount term_id trans_city  gender  
0 -1808.56     NaN       Tver       0  
1 -3390.41     NaN       Tver       0  
2  -144.50  889003       Tver       0  
3 -3542.30     NaN       Tver       0  
4 -3542.70     NaN       Tver       0  
                              client_id    trans_time  mcc_code  trans_type  \
14306  ec41f598a99167b68cb8e232f524017f  117 11:28:27      6011        2010   
14307  ec41f598a99167b68cb8e232f524017f  438 00:00:53      7993        1200   
14308  ec41f598a99167b68cb8e232f524017f  242 14:06:41      6011        2

In [49]:
train[['mcc_code', 'trans_type']].mean(axis=1)

0          3599.5
1          3599.5
2          2922.0
3          4010.5
4          4010.5
            ...  
3238726    6510.5
3238727    2922.0
3238729    4010.5
3238730    4010.5
3238731    3599.5
Length: 2591121, dtype: float64

In [13]:
label = train['gender']
train.drop('gender', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop('gender', axis=1, inplace=True)


In [10]:
def preprocessing_data(res: pd.DataFrame, data: pd.DataFrame) -> pd.DataFrame:
    # Формируем дату
    day_time = data['trans_time'].str.split(' ', n=1, expand=True)
    day_time.columns = ['day', 'time']
    day_time['day'] = day_time['day'].astype(int)

    # Стратовая дата
    start_date = datetime.datetime(2020, 3, 8, 0, 0, 0) - datetime.timedelta(219)

    # Замена времени в исходном датасете с гендерами
    trans_time = pd.Series(start_date + pd.to_timedelta(np.ceil(day_time['day']), unit="D"), name='trans_time')
    # trans_time.index = res['client_id']

    # trans_time.dt.month
    # trans_time.dt.day
    res['weekday'] = trans_time.dt.weekday
    # trans_time.dt.hour

    cat_mcc = res["mcc_code"]
    cat_mcc.index = res['client_id']
    cat_mcc.name = 'mcc_describe'

    a = cat_mcc.mask((724 <= cat_mcc) & (cat_mcc < 1799), 1) \
                .mask((1799 <= cat_mcc) & (cat_mcc < 2842) | (4900 <= cat_mcc) & (cat_mcc < 5200) | (5714 <= cat_mcc) & (cat_mcc < 5715) | (9702 <= cat_mcc) & (cat_mcc < 9752), 2) \
                .mask((2842 <= cat_mcc) & (cat_mcc < 3299), 3) \
                .mask((3299 <= cat_mcc) & (cat_mcc < 3441) | (7511 <= cat_mcc) & (cat_mcc < 7519), 4) \
                .mask((3441 <= cat_mcc) & (cat_mcc < 3882) | (6760 <= cat_mcc) & (cat_mcc < 7011), 5) \
                .mask((3882 <= cat_mcc) & (cat_mcc < 4789), 6) \
                .mask((4789 <= cat_mcc) & (cat_mcc < 4900), 7) \
                .mask((5200 <= cat_mcc) & (cat_mcc < 5499), 8) \
                .mask((5499 <= cat_mcc) & (cat_mcc < 5599) | (5699 <= cat_mcc) & (cat_mcc < 5714) | (5969 <= cat_mcc) & (cat_mcc < 5999), 9) \
                .mask((5599 <= cat_mcc) & (cat_mcc < 5699), 10) \
                .mask((5715 <= cat_mcc) & (cat_mcc < 5735) | (5811 <= cat_mcc) & (cat_mcc < 5950), 11) \
                .mask((5735 <= cat_mcc) & (cat_mcc < 5811) | (5999 <= cat_mcc) & (cat_mcc < 6760) | (5962 <= cat_mcc) & (cat_mcc < 5963) | (7011 <= cat_mcc) & (cat_mcc < 7033), 12) \
                .mask((5950 <= cat_mcc) & (cat_mcc < 5962) | (5963 <= cat_mcc) & (cat_mcc < 5969), 13) \
                .mask((7033 <= cat_mcc) & (cat_mcc < 7299), 14) \
                .mask((7299 <= cat_mcc) & (cat_mcc < 7511) | (7519 <= cat_mcc) & (cat_mcc < 7523), 15) \
                .mask((7523 <= cat_mcc) & (cat_mcc < 7699), 16) \
                .mask((7699 <= cat_mcc) & (cat_mcc < 7999), 17) \
                .mask((7999 <= cat_mcc) & (cat_mcc < 8351), 18) \
                .mask((8351 <= cat_mcc) & (cat_mcc < 8699), 19) \
                .mask((8699 <= cat_mcc) & (cat_mcc < 8999), 20) \
                .mask((8999 <= cat_mcc) & (cat_mcc < 9702) | (9752 <= cat_mcc) & (cat_mcc < 9754), 21)

    res['mcc_describe'] = a.reset_index(drop=True)
    res['mcc_describe'] = res['mcc_describe'].astype(object)

    res['amount_up'] = res['amount'].where(res['amount'] >= 0)
    a = res['amount_up']
    res['amount_up'] = a.mask(a < a.quantile(0.05), a.quantile(0.05)) \
                        .mask(a > a.quantile(0.95), a.quantile(0.95))
    res['amount_up'] = MinMaxScaler().fit_transform(res[['amount_up']])* 1000
    
    res['amount_down'] = res['amount'].where(res['amount'] <= 0).abs()
    a = res['amount_down']
    res['amount_down'] = a.mask(a < a.quantile(0.05), a.quantile(0.05)) \
                          .mask(a > a.quantile(0.95), a.quantile(0.95))
    res['amount_down'] = MinMaxScaler().fit_transform(res[['amount_down']]) * 1000

    # Характеристика по клиентам заработок и траты
    tmp = res[['client_id', 'amount_up', 'amount_down']].groupby('client_id').agg({'amount_up': ['mean', 'median', 'std', 'count', 'sum'], \
                                                                                   'amount_down': ['mean', 'median', 'std', 'count', 'sum']})
    tmp.columns = tmp.columns.map('{0[0]}_client_{0[1]}'.format)
    res = res.merge(tmp, how='outer', on='client_id')

    # Характеристика по кол-во трат клиентами в дни недели заработок и траты
    aaa = res[['client_id', 'weekday', 'amount_up', 'amount_down']].groupby(['client_id', 'weekday']).count()
    aaa = aaa.unstack(-1)
    aaa.columns = aaa.columns.map('{0[0]}_weekday_{0[1]}'.format)
    res = res.merge(aaa, how='outer', on='client_id')

    # Заработок - траты
    # res['delta+-'] = res['amount_up_client_sum'] - res['amount_down_client_sum']
    # a = res['delta+-']
    # res['delta+-'] = a.mask(a < a.quantile(0.05), a.quantile(0.05)) \
    #                   .mask(a > a.quantile(0.95), a.quantile(0.95))

    res['mcc_code'] = res.mcc_code.astype(object)
    res['trans_type'] = res.trans_type.astype(object)

    tmp = res.groupby('client_id')['term_id'].nunique()
    tmp.name = 'count_term_id'
    res = res.merge(tmp, how='outer', on='client_id')

    # Характеристика по неделям для всех заработок и траты
    tmp = res.groupby('client_id')['trans_type'].nunique()
    tmp.name = 'count_trans_type'
    res = res.merge(tmp, how='outer', on='client_id')

    # Характеристика по неделям для всех заработок и траты
    tmp = res.groupby('client_id')['mcc_code'].nunique()
    tmp.name = 'count_mcc_code'
    res = res.merge(tmp, how='outer', on='client_id')

    # Частота покупок за время существования
    time_client = pd.concat([trans_time, res['client_id']], axis=1)
    abc = time_client.groupby('client_id').agg({'trans_time': ['min', 'max']}).diff(axis=1)
    abc.columns = ['nan', 'days']
    abcde = pd.DataFrame(res['client_id'].value_counts()).merge(abc['days'].dt.days, on='client_id')
    all_time_freq = abcde['days'] / abcde['count']
    all_time_freq.name = 'all_time_freq'
    res = res.merge(all_time_freq, on='client_id')

    res.drop(['amount', 'amount_up', 'amount_down', 'weekday', 'trans_time'], axis=1, inplace=True)

    return res

In [5]:


train = preprocessing_data(transactions_train, transactions)
test = preprocessing_data(transactions_test, transactions)

train.info()
print(train.isna().sum())

test.info()
print(test.isna().sum())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3238732 entries, 0 to 3238731
Data columns (total 35 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   client_id                  object 
 1   mcc_code                   object 
 2   trans_type                 object 
 3   term_id                    object 
 4   trans_city                 object 
 5   gender                     int64  
 6   mcc_describe               object 
 7   amount_up_client_mean      float64
 8   amount_up_client_median    float64
 9   amount_up_client_std       float64
 10  amount_up_client_count     int64  
 11  amount_up_client_sum       float64
 12  amount_down_client_mean    float64
 13  amount_down_client_median  float64
 14  amount_down_client_std     float64
 15  amount_down_client_count   int64  
 16  amount_down_client_sum     float64
 17  amount_up_weekday_0        float64
 18  amount_up_weekday_1        float64
 19  amount_up_weekday_2        float64
 20  am

In [8]:
train.describe()

Unnamed: 0,gender,amount_up_client_mean,amount_up_client_median,amount_up_client_std,amount_up_client_count,amount_up_client_sum,amount_down_client_mean,amount_down_client_median,amount_down_client_std,amount_down_client_count,...,amount_down_weekday_1,amount_down_weekday_2,amount_down_weekday_3,amount_down_weekday_4,amount_down_weekday_5,amount_down_weekday_6,count_term_id,count_trans_type,count_mcc_code,all_time_freq
count,3238732.0,3158285.0,3158285.0,3101451.0,3238732.0,3238732.0,3238690.0,3238690.0,3238663.0,3238732.0,...,3238432.0,3238417.0,3238387.0,3238499.0,3238249.0,3238109.0,3238732.0,3238732.0,3238732.0,3238732.0
mean,0.4820109,216.745,168.0672,203.0557,2676.063,78508.68,177.2123,92.87917,226.6646,642.7153,...,94.99966,96.26439,96.92018,103.2352,87.33165,73.97372,180.1795,14.88811,27.05904,1.049283
std,0.4996764,185.6716,213.9255,120.4668,13399.05,275881.0,175.5052,196.3704,94.4755,507.7618,...,75.85459,77.70289,76.6527,83.06147,69.11825,58.25372,243.6244,5.476054,14.21097,1.419567
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,0.0,83.00778,36.81611,95.399,23.0,3258.265,77.05839,20.35059,155.5205,332.0,...,49.0,49.0,50.0,53.0,45.0,38.0,73.0,11.0,16.0,0.444878
50%,0.0,163.8496,97.01419,197.6408,54.0,9943.201,123.8315,35.37402,212.9447,519.0,...,76.0,77.0,80.0,83.0,70.0,60.0,120.0,15.0,26.0,0.7410423
75%,1.0,296.6036,172.2976,301.873,136.0,31225.63,210.3725,66.70057,290.8623,812.0,...,121.0,122.0,123.0,129.0,109.0,94.0,185.0,19.0,36.0,1.210667
max,1.0,1000.0,1000.0,707.1068,83848.0,2142225.0,1000.0,1000.0,675.0678,4444.0,...,632.0,706.0,696.0,747.0,582.0,479.0,1493.0,36.0,84.0,191.5


In [65]:
res = transactions_train
data = transactions

In [70]:
# Формируем дату
day_time = data['trans_time'].str.split(' ', n=1, expand=True)
day_time.columns = ['day', 'time']
day_time['day'] = day_time['day'].astype(int)

# Стратовая дата
start_date = datetime.datetime(2020, 3, 8, 0, 0, 0) - datetime.timedelta(219)

# Замена времени в исходном датасете с гендерами
trans_time = pd.Series(start_date + pd.to_timedelta(np.ceil(day_time['day']), unit="D"), name='trans_time')
# trans_time.index = res['client_id']

# trans_time.dt.month
# trans_time.dt.day
res['weekday'] = trans_time.dt.weekday
# trans_time.dt.hour

cat_mcc = res["mcc_code"]
cat_mcc.index = res['client_id']
cat_mcc.name = 'mcc_describe'

a = cat_mcc.mask((724 <= cat_mcc) & (cat_mcc < 1799), 1) \
            .mask((1799 <= cat_mcc) & (cat_mcc < 2842) | (4900 <= cat_mcc) & (cat_mcc < 5200) | (5714 <= cat_mcc) & (cat_mcc < 5715) | (9702 <= cat_mcc) & (cat_mcc < 9752), 2) \
            .mask((2842 <= cat_mcc) & (cat_mcc < 3299), 3) \
            .mask((3299 <= cat_mcc) & (cat_mcc < 3441) | (7511 <= cat_mcc) & (cat_mcc < 7519), 4) \
            .mask((3441 <= cat_mcc) & (cat_mcc < 3882) | (6760 <= cat_mcc) & (cat_mcc < 7011), 5) \
            .mask((3882 <= cat_mcc) & (cat_mcc < 4789), 6) \
            .mask((4789 <= cat_mcc) & (cat_mcc < 4900), 7) \
            .mask((5200 <= cat_mcc) & (cat_mcc < 5499), 8) \
            .mask((5499 <= cat_mcc) & (cat_mcc < 5599) | (5699 <= cat_mcc) & (cat_mcc < 5714) | (5969 <= cat_mcc) & (cat_mcc < 5999), 9) \
            .mask((5599 <= cat_mcc) & (cat_mcc < 5699), 10) \
            .mask((5715 <= cat_mcc) & (cat_mcc < 5735) | (5811 <= cat_mcc) & (cat_mcc < 5950), 11) \
            .mask((5735 <= cat_mcc) & (cat_mcc < 5811) | (5999 <= cat_mcc) & (cat_mcc < 6760) | (5962 <= cat_mcc) & (cat_mcc < 5963) | (7011 <= cat_mcc) & (cat_mcc < 7033), 12) \
            .mask((5950 <= cat_mcc) & (cat_mcc < 5962) | (5963 <= cat_mcc) & (cat_mcc < 5969), 13) \
            .mask((7033 <= cat_mcc) & (cat_mcc < 7299), 14) \
            .mask((7299 <= cat_mcc) & (cat_mcc < 7511) | (7519 <= cat_mcc) & (cat_mcc < 7523), 15) \
            .mask((7523 <= cat_mcc) & (cat_mcc < 7699), 16) \
            .mask((7699 <= cat_mcc) & (cat_mcc < 7999), 17) \
            .mask((7999 <= cat_mcc) & (cat_mcc < 8351), 18) \
            .mask((8351 <= cat_mcc) & (cat_mcc < 8699), 19) \
            .mask((8699 <= cat_mcc) & (cat_mcc < 8999), 20) \
            .mask((8999 <= cat_mcc) & (cat_mcc < 9702) | (9752 <= cat_mcc) & (cat_mcc < 9754), 21)

res['mcc_describe'] = a.reset_index(drop=True)
res['mcc_describe'] = res['mcc_describe'].astype(object)

res['amount_up'] = res['amount'].where(res['amount'] >= 0)
a = res['amount_up']
res['amount_up'] = a.mask(a < a.quantile(0.05), a.quantile(0.05)) \
                    .mask(a > a.quantile(0.95), a.quantile(0.95))
res['amount_up'] = MinMaxScaler().fit_transform(res[['amount_up']])* 1000

res['amount_down'] = res['amount'].where(res['amount'] <= 0).abs()
a = res['amount_down']
res['amount_down'] = a.mask(a < a.quantile(0.05), a.quantile(0.05)) \
                      .mask(a > a.quantile(0.95), a.quantile(0.95))
res['amount_down'] = MinMaxScaler().fit_transform(res[['amount_down']]) * 1000


In [69]:
res.describe()

Unnamed: 0,mcc_code,trans_type,amount,weekday,amount_up,amount_down
count,3238732.0,3238732.0,3238732.0,3238732.0,590321.0,2648422.0
mean,5587.873,2464.425,-681.0428,2.914603,3387.447366,1087.406
std,603.6783,2231.56,17665.47,1.934281,5668.065515,1882.733
min,742.0,1000.0,-4338732.0,0.0,71.39,35.12
25%,5211.0,1030.0,-722.32,1.0,278.08,95.71
50%,5541.0,1110.0,-180.74,3.0,783.94,289.0
75%,6010.0,2370.0,-36.84,5.0,3615.01,920.2775
max,9402.0,8146.0,4700293.0,6.0,21694.67,7232.21


In [71]:
res.describe()

Unnamed: 0,mcc_code,trans_type,amount,weekday,amount_up,amount_down
count,3238732.0,3238732.0,3238732.0,3238732.0,590321.0,2648422.0
mean,5587.873,2464.425,-681.0428,2.914603,153.355891,146.2099
std,603.6783,2231.56,17665.47,1.934281,262.127925,261.5964
min,742.0,1000.0,-4338732.0,0.0,0.0,0.0
25%,5211.0,1030.0,-722.32,1.0,9.558679,8.41868
50%,5541.0,1110.0,-180.74,3.0,32.95291,35.27537
75%,6010.0,2370.0,-36.84,5.0,163.879855,122.9882
max,9402.0,8146.0,4700293.0,6.0,1000.0,1000.0


In [11]:
train = preprocessing_data(train, transactions)
test = preprocessing_data(test, transactions)

train.info()
test.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['weekday'] = trans_time.dt.weekday
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['mcc_describe'] = a.reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['mcc_describe'] = res['mcc_describe'].astype(object)
A value is trying to be set on a copy of a slice from a DataFr

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2587293 entries, 0 to 2587292
Data columns (total 35 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   client_id                  object 
 1   mcc_code                   object 
 2   trans_type                 object 
 3   term_id                    object 
 4   trans_city                 object 
 5   gender                     int64  
 6   mcc_describe               object 
 7   amount_up_client_mean      float64
 8   amount_up_client_median    float64
 9   amount_up_client_std       float64
 10  amount_up_client_count     int64  
 11  amount_up_client_sum       float64
 12  amount_down_client_mean    float64
 13  amount_down_client_median  float64
 14  amount_down_client_std     float64
 15  amount_down_client_count   int64  
 16  amount_down_client_sum     float64
 17  amount_up_weekday_0        float64
 18  amount_up_weekday_1        float64
 19  amount_up_weekday_2        float64
 20  am

In [12]:

cat_features = ['mcc_code', 'trans_type', 'trans_city', 'mcc_describe']

model = CatBoostClassifier(
    iterations=1400,
    random_seed=63,
    learning_rate=0.0095,
    custom_loss='AUC',
    verbose=100
)
model.fit(
    train.drop(['term_id', 'client_id'], axis=1), train['gender'],
    cat_features=cat_features,
    eval_set=(test.drop(['term_id', 'client_id'], axis=1), test['gender']),
    plot=True
)
# model.fit(
#     train.drop(['term_id', 'client_id'], axis=1), label,
#     plot=True,
#     cat_features=cat_features
# )
# model.save_model('catboost_model2.json', format='json')

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=4]=7.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [5]:

# from sklearn.model_selection import train_test_split
# X_train, X_validation, y_train, y_validation = train_test_split(train.drop(['term_id', 'client_id'], axis=1), label,
#                                                                 train_size=0.7, random_state=234)
cat_features = ['mcc_code', 'trans_type', 'trans_city', 'mcc_describe']

model = CatBoostClassifier(
    iterations=1400,
    random_seed=63,
    learning_rate=0.0095,
    custom_loss='AUC',
    verbose=100
)
model.fit(
    train.drop(['term_id', 'client_id'], axis=1), label,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    plot=True
)
# model.fit(
#     train.drop(['term_id', 'client_id'], axis=1), label,
#     plot=True,
#     cat_features=cat_features
# )
# model.save_model('catboost_model2.json', format='json')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6888993	total: 1.12s	remaining: 26m 6s
100:	learn: 0.5836767	total: 1m 24s	remaining: 18m 12s
200:	learn: 0.5491254	total: 2m 52s	remaining: 17m 11s
300:	learn: 0.5244616	total: 4m 17s	remaining: 15m 38s
400:	learn: 0.5006445	total: 5m 44s	remaining: 14m 18s
500:	learn: 0.4786061	total: 7m 17s	remaining: 13m 4s
600:	learn: 0.4601352	total: 8m 53s	remaining: 11m 49s
700:	learn: 0.4420997	total: 10m 24s	remaining: 10m 22s
800:	learn: 0.4249564	total: 11m 54s	remaining: 8m 54s
900:	learn: 0.4100539	total: 13m 26s	remaining: 7m 26s
1000:	learn: 0.3966388	total: 15m 3s	remaining: 5m 59s
1100:	learn: 0.3835251	total: 16m 43s	remaining: 4m 32s
1200:	learn: 0.3705442	total: 18m 18s	remaining: 3m 2s
1300:	learn: 0.3571586	total: 19m 52s	remaining: 1m 30s
1399:	learn: 0.3458609	total: 21m 24s	remaining: 0us


In [38]:
from itertools import product
iterations = [1400]
learning_rate = [0.025, 0.015]

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train.drop(['term_id', 'client_id'], axis=1), label,
                                                                train_size=0.7, random_state=234)
cat_features = ['mcc_code', 'trans_type', 'trans_city', 'mcc_describe']


for itera, lr in product(iterations, learning_rate):
    print(itera, lr)
    model = CatBoostClassifier(
        iterations=itera,
        random_seed=63,
        learning_rate=lr,
        custom_loss='AUC',
        verbose=100
    )
    model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_validation, y_validation),
        plot=True
    )
    
    result = model.predict_proba(test.drop(['term_id', 'client_id'], axis=1))[:, 0]
    submission = pd.concat([transactions_test[['client_id']], pd.DataFrame(result, columns=['probability'])], axis=1)
    print(submission)
    submission.to_csv('test_submission.csv')
    
    model.save_model(f'model_itera_{itera}_lr_{lr}.json', format='json')

1400 0.025


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6829967	test: 0.6829676	best: 0.6829676 (0)	total: 2.68s	remaining: 1h 2m 35s


KeyboardInterrupt: 

In [41]:
cat_features = ['mcc_code', 'trans_type', 'trans_city', 'mcc_describe']

model = CatBoostClassifier(
    random_seed=63,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=100
)

model.fit(
    train.drop(['term_id', 'client_id'], axis=1), label,
    cat_features=cat_features
)

model.save_model(f'model_default_loss_func.json', format='json')

Learning rate set to 0.324986
0:	total: 2.23s	remaining: 37m 10s
100:	total: 1m 43s	remaining: 15m 24s
200:	total: 3m 16s	remaining: 13m 2s
300:	total: 4m 54s	remaining: 11m 22s
400:	total: 6m 28s	remaining: 9m 39s
500:	total: 8m 1s	remaining: 7m 59s
600:	total: 9m 47s	remaining: 6m 29s
700:	total: 11m 13s	remaining: 4m 47s
800:	total: 12m 28s	remaining: 3m 6s
900:	total: 13m 41s	remaining: 1m 30s
999:	total: 14m 47s	remaining: 0us


In [27]:
# from catboost import cv
# from catboost import Pool
#
# params = {}
# params['loss_function'] = 'Logloss'
# params['iterations'] = 1350
# params['custom_loss'] = 'AUC'
# params['random_seed'] = 63
# params['learning_rate'] = 0.0095
#
# cv_data = cv(
#     params = params,
#     pool = Pool(train.drop(['term_id', 'client_id'], axis=1),
#     label=label,
#     cat_features=cat_features),
#     fold_count=5,
#     shuffle=True,
#     partition_random_seed=0,
#     plot=True,
#     stratified=True,
#     verbose=False
# )

result = model.predict_proba(test.drop(['term_id', 'client_id'], axis=1))[:, 0]
submission = pd.concat([transactions_test[['client_id']], pd.DataFrame(result, columns=['probability'])], axis=1)
print(submission)
submission.to_csv('test_submission.csv')


# print(train[train['delta+-'] == -4629928.72])


# model = CatBoostClassifier()
# model.load_model('catboost_model.bin')
#
# result2 = model.predict_proba(train.drop(['term_id', 'client_id'], axis=1))
# #
# submission = pd.DataFrame(index=transactions_test['client_id'], data=result2, columns=['probability'])
# print(submission)
#
# submission.to_csv('test_submission.csv')

CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.