In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os

# Solved with index_col='client_id' 

In [2]:
PATH_DATA = './data'

# Read the transaction log and the labelled clients; both are indexed by client_id.
transactions = pd.read_csv(
    os.path.join(PATH_DATA, 'transactions.csv'),
    header=0,
    index_col='client_id',
)
gender_train = pd.read_csv(
    os.path.join(PATH_DATA, 'train.csv'),
    header=0,
    index_col='client_id',
)
# gender_test = pd.read_csv(os.path.join(PATH_DATA, 'test.csv'), header=0, index_col='client_id')

# Parse the time column: the first space of every raw value is replaced by ' days ' so
# that pd.to_timedelta can read it (presumably the raw format is "<day number> <HH:MM:SS>"
# - confirm against the data). The offset is then added to a fixed reference date,
# 219 days before 2020-03-08.
transactions['trans_time'] = (
    datetime.datetime(2020, 3, 8, 0, 0, 0) - datetime.timedelta(219)
) + pd.to_timedelta(
    transactions['trans_time'].str.replace(' ', ' days ', n=1)
)

# Keep only transactions of clients that have a label and attach that label;
# the 'Unnamed: 0' column left after the join is discarded.
transactions_train = (
    transactions
    .join(gender_train, how='inner')
    .drop(columns='Unnamed: 0')
)
# transactions_test = transactions.join(gender_test, how='inner').drop('Unnamed: 0', axis=1)

# Разбивка на train и test

In [3]:
# Draw 1250 client ids for the hold-out part. The groupby/count collapses the
# transaction rows to one row per (gender, client_id) pair before sampling.
razbivka = (
    transactions_train
    .groupby(['gender', 'client_id'])
    .count()
    .reset_index()
    .set_index('client_id')
    .sample(n=1250, random_state=42)
)
# Membership mask is computed once and reused for both halves of the split.
holdout_mask = transactions_train.index.isin(razbivka.index)
train = transactions_train[~holdout_mask]
test = transactions_train[holdout_mask]

In [4]:
# Put the hold-out labels aside, then remove them from the hold-out frame.
# Note: `test` is rebound here, so this cell cannot be run twice in a row.
test_target = test['gender']

test = test.drop(columns='gender')

In [5]:
# Stack both parts of the split row-wise into a single frame.
dataset = pd.concat(objs=[train, test], axis=0)

# Функции

In [6]:
def preprocessing_data(data: pd.DataFrame) -> pd.DataFrame:
    """Build a per-transaction feature table from labelled transactions.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions indexed by ``client_id``. The function reads the columns
        ``trans_time`` (datetime-like), ``amount``, ``mcc_code``, ``trans_type``,
        ``trans_city``, ``term_id`` and ``gender`` (labels 0 and 1 must both occur).

    Returns
    -------
    pd.DataFrame
        A new frame indexed by ``client_id`` holding the original categorical
        columns (cast to ``category``) plus the engineered per-client,
        per-MCC and per-weekday features. ``amount``, ``weekday``,
        ``trans_time`` and ``term_id`` are removed, and +/-inf values are
        replaced by 0.

    Notes
    -----
    * The input frame is not modified; all work happens on a copy.
    * ``div`` is derived from the ``gender`` column of every row passed in.
      If the frame contains clients that are later used for validation,
      that feature leaks the target into the evaluation.
    * Every merge/join is an inner one, so rows without a match are dropped.
    """
    # Work on a copy: the column assignments below would otherwise be written
    # straight into the caller's DataFrame.
    data = data.copy()

    # Day of the week of every transaction.
    data['weekday'] = data['trans_time'].dt.weekday

    # Split amount into an incoming and an outgoing part (NaN where not applicable).
    # Zero amounts satisfy both conditions and therefore appear in both columns.
    data['amount_up'] = data['amount'].where(data['amount'] >= 0)
    data['amount_down'] = data['amount'].where(data['amount'] <= 0).abs()

    # Number of transactions per (client, MCC code).
    mcc_counts = (
        data.groupby(['client_id', 'mcc_code'])['mcc_code']
            .count()
            .rename('mcc_code_count')
            .reset_index()
    )
    data = pd.merge(
        data.reset_index(),
        mcc_counts,
        on=['client_id', 'mcc_code'],
        how="inner"
    ).set_index('client_id')

    # Gender balance of every MCC code: (count for 1 - count for 0) / (count for 0 + count for 1).
    mcc_by_gender = pd.pivot_table(data,
                                   index='mcc_code',
                                   values="amount",
                                   columns='gender',
                                   aggfunc='count').fillna(0)
    mcc_by_gender['div'] = (
        (mcc_by_gender[1] - mcc_by_gender[0])
        / (mcc_by_gender[0] + mcc_by_gender[1])
    )
    data = (
        data.reset_index()
            .merge(mcc_by_gender['div'], on='mcc_code', how='inner')
            .set_index('client_id')
    )

    # Per-client statistics of incoming/outgoing amounts and variety of types/codes.
    client_stats = (
        data
        .groupby('client_id')
        .agg({
             'amount_up': ['mean', 'median', 'std', 'count', 'sum'],
             'amount_down': ['mean', 'median', 'std', 'count', 'sum'],
             'trans_type': 'nunique',
             'mcc_code': 'nunique'
        })
    )
    client_stats.columns = ['_client_'.join(col).strip() for col in client_stats.columns.values]
    data = data.join(client_stats, how='inner')

    # Per-MCC statistics of incoming/outgoing amounts.
    mcc_stats = (
        data
        .groupby('mcc_code')
        .agg({
             'amount_up': ['mean', 'median', 'std', 'count', 'sum'],
             'amount_down': ['mean', 'median', 'std', 'count', 'sum']
        })
    )
    mcc_stats.columns = ['_mcc_'.join(map(str, col)).strip() for col in mcc_stats.columns.values]
    data = data.reset_index().merge(mcc_stats, on='mcc_code', how='inner').set_index('client_id')

    # Income minus spending per client.
    data['delta+-'] = data['amount_up_client_sum'] - data['amount_down_client_sum']

    # Transactions per day over the client's observed lifetime. A lifetime of
    # zero whole days yields inf, which is replaced by 0 further down.
    trans_span = data.groupby('client_id')['trans_time'].agg(['min', 'max'])
    active_days = (trans_span['max'] - trans_span['min']).dt.days
    all_time_freq = (data.index.value_counts() / active_days).rename('all_time_freq')
    data = data.join(all_time_freq, how='inner')

    # Number of incoming/outgoing transactions per client and weekday, one column
    # per (direction, weekday), plus the mean over the weekday columns.
    weekday_counts = data.groupby(['client_id', 'weekday'])[['amount_up', 'amount_down']].count()
    weekday_counts = weekday_counts.unstack(-1)
    weekday_counts.columns = weekday_counts.columns.map('{0[0]}_weekday_{0[1]}'.format)
    data = data.join(weekday_counts, how='inner')
    data['amount_mean_up_weekday'] = data[[x for x in data.columns if "amount_up_weekday" in x]].mean(axis=1)
    data['amount_mean_down_weekday'] = data[[x for x in data.columns if "amount_down_weekday" in x]].mean(axis=1)

    # Drop the raw columns that were only needed to derive features.
    data = data.drop(columns=['amount', 'weekday', 'trans_time', 'term_id'])

    # Replace infinities (from divisions by zero) with 0.
    data = data.replace([np.inf, -np.inf], 0)

    # Cast the identifier-like columns and the label to categorical dtype.
    data = data.astype({'mcc_code': 'category',
                        'trans_type': 'category',
                        'trans_city': 'category',
                        'gender': 'category'})

    return data

# Эксперимент по слиянию пользователей

In [7]:
# Build the transaction-level feature table from all labelled transactions.
dataset_new2 = preprocessing_data(data=transactions_train)

In [8]:
# Print the index, column names and dtypes of the engineered feature table.
dataset_new2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3238732 entries, 0002cf30347684df542e1a931f356875 to fffedf876a0ea3d39e54b706165a4826
Data columns (total 48 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   mcc_code                   category
 1   trans_type                 category
 2   trans_city                 category
 3   gender                     category
 4   amount_up                  float64 
 5   amount_down                float64 
 6   mcc_code_count             int64   
 7   div                        float64 
 8   amount_up_client_mean      float64 
 9   amount_up_client_median    float64 
 10  amount_up_client_std       float64 
 11  amount_up_client_count     int64   
 12  amount_up_client_sum       float64 
 13  amount_down_client_mean    float64 
 14  amount_down_client_median  float64 
 15  amount_down_client_std     float64 
 16  amount_down_client_count   int64   
 17  amount_down_client_sum     float64 
 18  trans_type_

In [9]:
# Collapse every non-categorical feature to one row per client by taking its mean.
dataset_new2_decimal = (
    dataset_new2
    .select_dtypes(exclude=['category'])
    .groupby('client_id')
    .mean()
)

In [10]:
# dataset_new2_decimal = dataset_new2.astype({'mcc_code':'int32'}).select_dtypes(exclude=['category']).reset_index().groupby(['client_id', 'mcc_code'], as_index=False).mean().set_index('client_id')

In [11]:
# Collapse every categorical feature to one row per client by taking the first
# entry of its mode (NaN is allowed to be the mode).
dataset_new2_category = (
    dataset_new2
    .select_dtypes(include=['category'])
    .groupby('client_id')
    .agg(lambda column: column.mode(dropna=False)[0])
)

In [12]:
# Put the numeric and the categorical per-client aggregates side by side
# (left join on the client_id index, which is the default of DataFrame.join).
dataset_new2_res = dataset_new2_decimal.join(dataset_new2_category, how='left')

In [13]:
# Hold out 2268 clients for evaluation.
# `dataset_new2_res` comes from groupby('client_id') aggregations and so already has
# one row per client; rows are therefore sampled from it directly. The previous
# groupby(['gender', 'client_id']).count() step was redundant, and when `gender` is
# categorical and pandas keeps unobserved categories in a groupby it yields one row
# per (gender level, client), so the sample could contain the same client twice and
# the hold-out would end up with fewer than 2268 clients.
razbivka = dataset_new2_res.sample(n=2268, random_state=42)
holdout_mask = dataset_new2_res.index.isin(razbivka.index)
train = dataset_new2_res[~holdout_mask]
test = dataset_new2_res[holdout_mask]

In [14]:
# CatBoost classifier with a fixed seed; iteration count and learning rate are left
# at the library defaults (the tuned values tried earlier are kept as comments).
# NOTE(review): the hold-out frame is passed as eval_set, so any iteration selection
# CatBoost performs on it makes the reported hold-out scores optimistic - confirm.
model = CatBoostClassifier(
    random_seed=42,
    custom_loss='AUC',
    verbose=False,
    # iterations=20000,
    # learning_rate=0.001,
)
model.fit(
    train.drop(columns='gender'),
    train['gender'],
    cat_features=['mcc_code', 'trans_type', 'trans_city'],
    eval_set=(test.drop(columns='gender'), test['gender']),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1466b1450>