In [198]:
def preprocessing_data(data: pd.DataFrame) -> pd.DataFrame:
    """Build per-transaction and per-client features from raw transactions.

    The input frame is not modified; all work happens on a new frame.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions indexed by ``client_id`` with at least the columns
        ``trans_time`` (datetime), ``mcc_code``, ``trans_type``, ``amount``,
        ``term_id`` and ``gender`` (both values ``0`` and ``1`` must occur).

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``client_id`` with ``amount``, ``trans_time``,
        ``term_id`` (and ``weekday`` if it was present) removed and the
        following columns added:

        * ``amount_up`` / ``amount_down`` - non-negative amounts and absolute
          values of non-positive amounts (NaN elsewhere);
        * ``mcc_code_count`` - number of rows per (client, MCC code);
        * ``div`` - per MCC code, (count for gender 1 - count for gender 0)
          divided by their sum;
        * ``<col>_client_<stat>`` - per-client aggregates of ``amount_up``,
          ``amount_down``, ``trans_type`` and ``mcc_code``;
        * ``delta+-`` - per-client sum of ``amount_up`` minus sum of
          ``amount_down``;
        * ``all_time_freq`` - rows per client divided by the number of whole
          days between the client's first and last transaction (at least 1).

        ``mcc_code`` and ``trans_type`` are cast to ``category``. Rows whose
        merge keys find no match are dropped by the inner merges/joins.

    Raises
    ------
    KeyError
        If a required column, or one of the gender values 0/1, is missing.
    """
    # Split amount into incoming (+) and outgoing (-) parts. ``assign`` returns
    # a new frame, so the caller's DataFrame is left untouched.
    # NOTE(review): a zero amount satisfies both conditions and therefore lands
    # in both columns - confirm this is intended.
    data = data.assign(
        amount_up=data['amount'].where(data['amount'] >= 0),
        amount_down=data['amount'].where(data['amount'] <= 0).abs(),
    )

    # Number of transactions per (client, MCC code)
    mcc_counts = (
        data.groupby(['client_id', 'mcc_code'])['mcc_code']
            .count()
            .rename('mcc_code_count')
            .reset_index()
    )
    data = pd.merge(
        data.reset_index(),
        mcc_counts,
        on=['client_id', 'mcc_code'],
        how="inner"
    ).set_index('client_id')

    # Gender skew of every MCC code: (n_gender1 - n_gender0) / (n_gender0 + n_gender1)
    # NOTE(review): if ``gender`` is the prediction target, this feature is
    # derived from the target on the same rows - confirm this is acceptable.
    gender_counts = pd.pivot_table(data,
                                   index='mcc_code',
                                   values="amount",
                                   columns='gender',
                                   aggfunc='count').fillna(0)
    gender_div = (
        (gender_counts[1] - gender_counts[0])
        / (gender_counts[0] + gender_counts[1])
    ).rename('div')
    data = (
        data.reset_index()
            .merge(gender_div, on='mcc_code', how='inner')
            .set_index('client_id')
    )

    # Per-client aggregates
    client_stats = (
        data[['amount_up', 'amount_down', 'trans_type', 'mcc_code']]
        .groupby('client_id')
        .agg({
             'amount_up': ['mean', 'median', 'std', 'count', 'sum'],
             'amount_down': ['mean', 'median', 'std', 'count', 'sum'],
             'trans_type': 'nunique',
             'mcc_code': 'nunique'
        })
    )
    # Flatten the (column, stat) MultiIndex into "<column>_client_<stat>"
    client_stats.columns = ['_client_'.join(col).strip() for col in client_stats.columns.values]
    data = data.join(client_stats, how='inner')

    # Income minus spending
    data['delta+-'] = data['amount_up_client_sum'] - data['amount_down_client_sum']

    # Transaction frequency over the client's observed lifetime
    trans_time_by_client = data.groupby('client_id')['trans_time']
    active_days = (trans_time_by_client.max() - trans_time_by_client.min()).dt.days
    # A client whose transactions all fall within one day has a span of 0 days;
    # treat it as one day so the ratio stays finite instead of becoming inf.
    all_time_freq = (
        data.groupby('client_id').size() / active_days.clip(lower=1)
    ).rename('all_time_freq')
    data = data.join(all_time_freq, how='inner')

    # Drop columns that are no longer needed. ``weekday`` is removed only if the
    # incoming frame already carried it.
    data = (
        data.drop(columns=['amount', 'trans_time', 'term_id'])
            .drop(columns=['weekday'], errors='ignore')
    )

    # Type casting
    data = data.astype({'mcc_code': 'category', 'trans_type': 'category'})

    return data

# Разбивка на train и test

In [93]:
# Hold out a random sample of 1250 (gender, client_id) groups as the test set;
# every transaction of a sampled client goes to test, the rest to train.
# random_state pins the sample so the split is reproducible after a kernel restart.
# NOTE(review): the sample is not stratified by gender - confirm that is intended.
razbivka = (
    transactions_train.groupby(['gender', 'client_id'])
    .count()
    .reset_index()
    .set_index('client_id')
    .sample(n=1250, random_state=42)
)
is_test_client = transactions_train.index.isin(razbivka.index)
train = transactions_train[~is_test_client]
test = transactions_train[is_test_client]

In [94]:
# Render both halves of the split, train first, for a visual sanity check
for split_frame in (train, test):
    display(split_frame)

Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender,weekday,amount_up,amount_down
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0002cf30347684df542e1a931f356875,2020-06-10 14:52:03,4829,2370,-2170.07,888990,Saint Petersburg,0,2,,2170.07
0002cf30347684df542e1a931f356875,2019-12-30 14:10:49,6011,2010,-1445.64,,Saint Petersburg,0,0,,1445.64
0002cf30347684df542e1a931f356875,2019-12-02 12:38:32,5912,1010,-107.07,,Saint Petersburg,0,0,,107.07
0002cf30347684df542e1a931f356875,2020-01-08 13:22:34,6011,2010,-2892.86,,Saint Petersburg,0,2,,2892.86
0002cf30347684df542e1a931f356875,2020-04-15 12:06:54,5912,1010,-164.49,469965,Saint Petersburg,0,2,,164.49
...,...,...,...,...,...,...,...,...,...,...
fffedf876a0ea3d39e54b706165a4826,2019-08-30 12:51:47,6011,2010,-10846.43,,Saint Petersburg,1,4,,10846.43
fffedf876a0ea3d39e54b706165a4826,2019-09-26 07:37:08,6011,2010,-2170.15,,Saint Petersburg,1,3,,2170.15
fffedf876a0ea3d39e54b706165a4826,2019-09-26 07:40:10,4814,1030,-71.41,,Saint Petersburg,1,3,,71.41
fffedf876a0ea3d39e54b706165a4826,2020-02-06 15:05:52,6011,2010,-2169.75,,Saint Petersburg,1,3,,2169.75


Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender,weekday,amount_up,amount_down
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0002f4661e0d4d2ae097d78552dad022,2019-11-11 17:15:59,5411,1110,-75.04,,Novosibirsk,0,0,,75.04
0002f4661e0d4d2ae097d78552dad022,2020-06-05 20:57:38,5499,1010,-56.16,543894,Novosibirsk,0,4,,56.16
0002f4661e0d4d2ae097d78552dad022,2020-06-19 14:02:54,6011,2010,-145.32,322185,Novosibirsk,0,4,,145.32
0002f4661e0d4d2ae097d78552dad022,2020-07-24 17:38:19,4816,1100,-71.82,03170931,Novosibirsk,0,4,,71.82
0002f4661e0d4d2ae097d78552dad022,2020-02-29 20:29:06,5813,1110,-143.63,569557,Novosibirsk,0,5,,143.63
...,...,...,...,...,...,...,...,...,...,...
fffbed6c440a05e5593ad1a865e586a8,2020-01-23 15:17:00,5331,1010,-44.75,,Vladivostok,1,3,,44.75
fffbed6c440a05e5593ad1a865e586a8,2020-08-24 22:48:37,4814,1030,-44.26,888986,Vladivostok,1,0,,44.26
fffbed6c440a05e5593ad1a865e586a8,2020-05-12 00:00:40,5541,1110,-650.67,00480618,Vladivostok,1,1,,650.67
fffbed6c440a05e5593ad1a865e586a8,2019-11-25 12:52:54,6011,7010,1446.00,,Vladivostok,1,0,1446.00,
