# Ноутбук с моделью

## Импортируем необходимые пакеты

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import fastparquet
import os
import tqdm
import lightgbm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## Считываем данные в датафреймы

In [2]:
# Считываем первый файл с тренировочной выборкой для ознакомления с данными
df_raw_0 = pd.read_parquet('train_data/train_data_0.pq', engine='fastparquet')
df_raw_0.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0


In [4]:
columns_to_read = list(df_raw_0.columns)

In [11]:
# путь до данных на компьютере
train_data_path = './train_data/'

In [14]:
# Считываем данные из архива с помощью функции
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """
    читает num_parts_to_read партиций, преобразовывает их к pd.DataFrame и возвращает
    :param path_to_dataset: путь до директории с партициями
    :param start_from: номер партиции, с которой нужно начать чтение
    :param num_parts_to_read: количество партиций, которые требуется прочитать
    :param columns: список колонок, которые нужно прочитать из партиции
    :return: pd.DataFrame
    """

    res = []
    dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                              if filename.startswith('train')])
    print(dataset_paths)

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)
    for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):
        print('chunk_path', chunk_path)
        chunk = pd.read_parquet(chunk_path,columns=columns)
        res.append(chunk)

    return pd.concat(res).reset_index(drop=True)

In [8]:
df_raw_0 = read_parquet_dataset_from_local(path_to_dataset=path, start_from = 0,
                                     num_parts_to_read = 1, columns=columns_to_read, verbose=False)
df_raw_0.head()

['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_0.pq


Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0


In [3]:
targets = pd.read_csv('train_target.csv')
targets.head(5)

Unnamed: 0,id,flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


## EDA

In [11]:
# Сначала посмотрим на датафрейм с таргетом
targets.shape

(3000000, 2)

In [10]:
targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 2 columns):
 #   Column  Dtype
---  ------  -----
 0   id      int64
 1   flag    int64
dtypes: int64(2)
memory usage: 45.8 MB


In [12]:
targets.flag.value_counts()

flag
0    2893558
1     106442
Name: count, dtype: int64

In [14]:
# Выборка сильно несбалансирована. Необходимо будет применять методы балансировки
# Проверим на пропуски
targets.isna().sum()

id      0
flag    0
dtype: int64

In [16]:
# Проверим на дубликаты
targets.duplicated().sum()

0

In [17]:
# Анализируем датафрейм с фичами
df_raw_0.shape

(1974724, 61)

In [19]:
df_raw_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1974724 entries, 0 to 1974723
Data columns (total 61 columns):
 #   Column                         Dtype
---  ------                         -----
 0   id                             int64
 1   rn                             int64
 2   pre_since_opened               int64
 3   pre_since_confirmed            int64
 4   pre_pterm                      int64
 5   pre_fterm                      int64
 6   pre_till_pclose                int64
 7   pre_till_fclose                int64
 8   pre_loans_credit_limit         int64
 9   pre_loans_next_pay_summ        int64
 10  pre_loans_outstanding          int64
 11  pre_loans_total_overdue        int64
 12  pre_loans_max_overdue_sum      int64
 13  pre_loans_credit_cost_rate     int64
 14  pre_loans5                     int64
 15  pre_loans530                   int64
 16  pre_loans3060                  int64
 17  pre_loans6090                  int64
 18  pre_loans90                    int64
 19  

In [20]:
# Проверим на пропуски
df_raw_0.isna().sum().sum()

0

In [54]:
# Проверим на дубликаты
df_raw_0.duplicated().sum()

0

In [24]:
# Для лучшего понимания данных посмотрим, какая информация имеется для id = 0
df_raw_0[df_raw_0['id'] == 0]

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
5,0,6,5,0,11,8,12,11,4,2,...,3,3,3,4,1,2,3,1,0,1
6,0,7,3,9,1,2,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
7,0,8,2,9,2,3,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
8,0,9,1,9,11,13,14,8,2,5,...,3,3,3,4,1,2,4,1,0,0
9,0,10,7,9,2,10,8,8,16,4,...,3,3,3,4,1,2,4,1,0,0


In [25]:
# Перед объединением с таргетом потребуется группировать данный датафрейм по id с различной логикой агрегирования по 
# каждому признаку

In [29]:
# Посмотрим общую статистику
df_raw_0.describe()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
count,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,...,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0,1974724.0
mean,125946.5,6.537471,9.271325,8.330264,8.299071,8.309444,6.991781,8.337011,9.740888,2.399848,...,2.313242,2.343014,2.376717,3.576724,1.027562,2.694898,3.353936,1.002386,0.1721385,0.2239336
std,72269.81,5.121167,5.773757,4.593511,5.245598,4.464844,5.184165,4.233992,5.799625,1.376391,...,1.247674,1.228473,1.205555,1.035586,0.2767022,0.4891363,1.057304,0.05909327,0.3775009,0.4168782
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,63441.0,3.0,4.0,5.0,4.0,6.0,1.0,5.0,5.0,2.0,...,3.0,3.0,3.0,4.0,1.0,2.0,3.0,1.0,0.0,0.0
50%,126375.0,5.0,9.0,9.0,8.0,8.0,6.0,10.0,10.0,2.0,...,3.0,3.0,3.0,4.0,1.0,3.0,4.0,1.0,0.0,0.0
75%,188997.0,9.0,14.0,11.0,13.0,12.0,12.0,11.0,15.0,2.0,...,3.0,3.0,3.0,4.0,1.0,3.0,4.0,1.0,0.0,0.0
max,249999.0,51.0,19.0,17.0,17.0,16.0,16.0,15.0,19.0,6.0,...,3.0,3.0,3.0,4.0,6.0,6.0,5.0,3.0,1.0,1.0


In [61]:
# Проверим есть ли отрицательные значения в данных (ошибки)
def negative_numbers(df):
    count = 0
    for col in df.columns:
        if df[col].min() < 0:
            count += 1
    return count

In [36]:
count = negative_numbers(df_raw_0)
count

0

In [5]:
binary_cols = [
        'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90',
        'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'pclose_flag', 'fclose_flag'
                  ]

In [45]:
# Проверим наличие ошибок в бинарных признаках
def binary_features_check(df):
    binary_cols = [
        'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90',
        'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'pclose_flag', 'fclose_flag'
                  ]
    list_0_1 = [0, 1]
    mistaken_cols = []
    for col in binary_cols:
        if list_0_1 != sorted(list(df[col].unique())):
            mistaken_cols.append(col)
    return mistaken_cols

In [46]:
mistaken_cols = binary_features_check(df_raw_0)
mistaken_cols

[]

In [47]:
# Ошибок нет
# Посмотрим есть ли выбросы в данных. Проверяем все признаки кроме бинарных
def calculate_iqr_boundaries(series):
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    iqr = q75 - q25

    boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
    return boundaries

In [50]:
columns = columns_to_read.copy()
for col in binary_cols:
    columns.remove(col)
for column in columns:
    bounds = calculate_iqr_boundaries(df_raw_0[column])
    n_outliers = df_raw_0[(df_raw_0[column] < bounds[0]) | (df_raw_0[column] > bounds[1])].shape[0]    
    print(f'Границы в признаке {column} - {bounds}')
    print(f'Минимум и максимум в признаке {column} - {(df_raw_0[column].min(), df_raw_0[column].max())}')
    print(f'Количество выбросов в признаке {column} - {n_outliers}')
    print('   ')

Границы в признаке id - (-124893.0, 377331.0)
Минимум и максимум в признаке id - (0, 249999)
Количество выбросов в признаке id - 0
   
Границы в признаке rn - (-6.0, 18.0)
Минимум и максимум в признаке rn - (1, 51)
Количество выбросов в признаке rn - 64592
   
Границы в признаке pre_since_opened - (-11.0, 29.0)
Минимум и максимум в признаке pre_since_opened - (0, 19)
Количество выбросов в признаке pre_since_opened - 0
   
Границы в признаке pre_since_confirmed - (-4.0, 20.0)
Минимум и максимум в признаке pre_since_confirmed - (0, 17)
Количество выбросов в признаке pre_since_confirmed - 0
   
Границы в признаке pre_pterm - (-9.5, 26.5)
Минимум и максимум в признаке pre_pterm - (0, 17)
Количество выбросов в признаке pre_pterm - 0
   
Границы в признаке pre_fterm - (-3.0, 21.0)
Минимум и максимум в признаке pre_fterm - (0, 16)
Количество выбросов в признаке pre_fterm - 0
   
Границы в признаке pre_till_pclose - (-15.5, 28.5)
Минимум и максимум в признаке pre_till_pclose - (0, 16)
Количест

In [51]:
# Выбросы в данных есть. Однако ввиду природы этих данных (данные закодированы) невозможно со 100 % точностью утверждать,
# что это на самом деле выбросы

In [52]:
# Теперь необходимо проверить остальные датафреймы. Будем проверять в следующей последовательности:
# 1. Проверка на пропуски
# 2. Проверка на дубли
# 3. Проверка на ошибки в бинарных признаках
# 4. Проверка на отрицательные значения в данных
# Проверку будем проводить в цикле. Результат записывать в словарь
# Поскольку первую порцию данных уже проверили, не будем ее включать в проверку

In [62]:
analysis_dict = {}
for num in range(1, 12):
    df = read_parquet_dataset_from_local(path_to_dataset=path, start_from = num,
                                     num_parts_to_read = 1, columns=columns_to_read, verbose=False)
    results = {}
    results['gaps'] = df.isna().sum().sum()
    results['duplicates'] = df.duplicated().sum()
    results['mistaken_cols'] = binary_features_check(df)
    results['negative_numbers'] = negative_numbers(df)   
    analysis_dict[num] = results

['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_1.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_10.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_11.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_2.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_3.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_4.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_5.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_6.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_7.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_8.pq
['train_data/train_data_0.pq', 'train_data/train_data_1.pq', 'train_data/train_data_10.pq', 'train_data/train_data_11.pq', 'train_data/train_data_2.pq', 'train_data/train_data_3.pq', 'train_data/train_data_4.pq', 'train_data/train_data_5.pq', 'train_data/train_data_6.pq', 'train_data/train_data_7.pq', 'train_data/train_data_8.pq', 'train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path train_data/train_data_9.pq


In [63]:
analysis_dict

{1: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 2: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 3: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 4: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 5: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 6: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 7: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 8: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 9: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 10: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0},
 11: {'gaps': 0, 'duplicates': 0, 'mistaken_cols': [], 'negative_numbers': 0}}

In [64]:
# Проверка завершена. Пропусков, дубликатов, отризательных значений и ошибочных бинарных значений в данных нет
# Для проведения анализа корреляции признаков от целевой переменной необходимо сначала преобразовать данные

## Data Preparation

### Experiments

In [None]:
# Будем проводить эксперименты с целью максимации метрики roc_auc
# Для этого будем изменять подходы к feature engineering и data preparation, а также подбирать гиперпараметры моделей
# Оценку будем проводить на одной выборке и трех моделях

In [6]:
# Эксперимент 1. Сперва закодируем все фичи кроме id и бинарных признаков, а потом сгруппируем.

In [14]:
columns_to_encode_1 = columns_to_read.copy()
columns_to_remove_1 = binary_cols.copy()
columns_to_remove_1.append('id')
for col in columns_to_remove_1:
    columns_to_encode_1.remove(col)

In [None]:
# Кодируем данные. Используем OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_features = ohe.fit_transform(df_raw_0[columns_to_encode_1])
df_encoded_features = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out())
df_encoded = pd.concat([df_raw_0, df_encoded_features], axis=1)
df_encoded.head()

In [25]:
df_cut = df_encoded.drop(columns_to_encode_1, axis=1)
df_cut.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_status_5,enc_loans_credit_status_6,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3
0,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,1,1,1,1,1,0,1,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0,0,1,1,1,1,1,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [26]:
df_cut.to_csv('df_cut.csv', index = False)

In [2]:
df_cut = pd.read_csv('df_cut.csv')
df_cut.shape

(1974724, 387)

In [4]:
# Бинарные фичи будем аггрегировать по сумме
# Остальные фичи будем аггрегировать по медиане
# Напишем функцию
def agg_methods(df, agg_func_1, agg_func_2):
    binary_cols = [
        'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90',
        'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'pclose_flag', 'fclose_flag'
                  ]
    columns_to_remove = binary_cols.copy()
    columns_to_remove.append('id')
    nonbinary_columns = list(df_cut.columns)
    for col in columns_to_remove:
        nonbinary_columns.remove(col)
    agg_dict = {}
    for col in binary_cols:
        agg_dict[col] = agg_func_1
    for col in nonbinary_columns:
        agg_dict[col] = agg_func_2
    return agg_dict

In [5]:
agg_dict = agg_methods(df_cut, 'sum', 'median')

In [6]:
df_encoded_groupped = df_cut.groupby(['id'], as_index=False).agg(agg_dict)
df_encoded_groupped.shape

(250000, 387)

In [12]:
df_encoded_groupped.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_status_5,enc_loans_credit_status_6,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3
0,0,9,10,10,10,10,6,9,9,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1,12,10,12,12,11,10,12,11,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,2,3,2,2,2,3,1,3,2,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,3,15,15,15,15,15,8,14,14,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,4,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [13]:
df_encoded_groupped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Columns: 387 entries, id to enc_loans_account_cur_3
dtypes: float64(376), int64(11)
memory usage: 738.1 MB


In [7]:
df_prepared_1 = pd.merge(left=df_encoded_groupped, right=targets, on='id', how='inner').drop('id', axis=1)
df_prepared_1.shape

(250000, 387)

In [8]:
# Сперва проведем оценку моделей с имеющимся подготовленным датафреймом
# Выделим таргет в отдельный датафрейм
X = df_prepared_1.drop('flag', axis=1)
y = df_prepared_1['flag']

In [6]:
random_seed = 1

In [18]:
# Проведем оценку логистической модели
random_seed = 1
log_model = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=random_seed)

In [19]:
score = cross_val_score(
    log_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.7066, roc_auc_std: 0.0044


In [19]:
# Проведем оценку модели случайного леса
random_seed = 1
forest_model = RandomForestClassifier(class_weight='balanced', random_state=random_seed)

In [21]:
score = cross_val_score(
    forest_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.6532, roc_auc_std: 0.0057


In [9]:
# Пробуем модель MLP
RANDOM_SEED=1
MLP_model = MLPClassifier(random_state=RANDOM_SEED, activation='logistic', max_iter=2000, hidden_layer_sizes=(400,), learning_rate_init=0.002, learning_rate='adaptive')

In [10]:
score = cross_val_score(
    MLP_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.5747, roc_auc_std: 0.0121


In [None]:
# Эксперимент 2. Изменим агрегирующие функции для признаков. Будем агрегировать по сумме все признаки
df_exp1 = df_cut.groupby(['id'], as_index=False).agg('sum')
df_prepared_exp1 = pd.merge(left=df_exp1, right=targets, on='id', how='inner').drop('id', axis=1)

In [30]:
df_prepared_exp1.to_csv('df_prepared_exp1.csv')

In [None]:
X = df_prepared_exp1.drop('flag', axis=1)
y = df_prepared_exp1['flag']

In [22]:
# Проверяем как влияют изменения на метрику модели логистической регрессии
score = cross_val_score(
    log_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.7473, roc_auc_std: 0.0070


In [23]:
# Проверяем как влияют изменения на метрику модели случайного леса
score = cross_val_score(
    forest_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.7020, roc_auc_std: 0.0051


In [None]:
# Пробуем модель MLP
RANDOM_SEED=1
MLP_model = MLPClassifier(random_state=RANDOM_SEED, activation='logistic', max_iter=2000, hidden_layer_sizes=(400,), learning_rate_init=0.002)
score = cross_val_score(
    MLP_model,
    X,
    np.ravel(y),                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

In [17]:
# Изменение функции агрегации дало прирост метрики roc auc на 0.05
# Считаем эксперимент удачным

In [9]:
# Эксперимент 3.
# Теперь пробуем сначала агрегировать данные, а затем кодировать
# Агрегировать будем по сумме:
df_groupped = df_raw_0.groupby(['id'], as_index=False).agg('sum')

In [10]:
df_groupped.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,55,81,76,71,75,114,107,96,31,...,24,24,24,40,10,26,35,10,1,2
1,1,105,160,107,93,111,141,98,114,30,...,30,34,33,47,14,38,44,14,1,2
2,2,6,25,32,21,18,15,27,5,4,...,7,7,7,12,3,7,10,3,2,2
3,3,120,105,110,114,117,78,122,148,39,...,24,24,24,45,15,38,54,15,5,6
4,4,1,12,9,4,8,1,11,12,1,...,3,3,3,4,1,2,3,1,1,1


In [27]:
df_groupped.shape

(250000, 61)

In [25]:
df_groupped.columns

Index(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5',
       'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90',
       'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
       'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
       'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 

In [13]:
columns_to_encode_2 = list(df_groupped.columns)
columns_to_remove_2 = binary_cols.copy()
columns_to_remove_2.append('id')
for col in columns_to_remove_2:
    columns_to_encode_2.remove(col)

In [14]:
# Теперь кодируем:
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_features = ohe.fit_transform(df_groupped[columns_to_encode_2])
df_encoded_features = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out())
df_exp2 = pd.concat([df_groupped.drop(columns_to_encode_2, axis=1), df_encoded_features], axis=1)
df_exp2.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_account_cur_37,enc_loans_account_cur_38,enc_loans_account_cur_39,enc_loans_account_cur_40,enc_loans_account_cur_41,enc_loans_account_cur_42,enc_loans_account_cur_43,enc_loans_account_cur_44,enc_loans_account_cur_46,enc_loans_account_cur_51
0,0,9,10,10,10,10,6,9,9,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,12,10,12,12,11,10,12,11,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,2,2,2,3,1,3,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,15,15,15,15,15,8,14,14,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_exp2.shape

(250000, 8696)

In [16]:
df_exp2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Columns: 8696 entries, id to enc_loans_account_cur_51
dtypes: float64(8685), int64(11)
memory usage: 16.2 GB


In [19]:
# Размер файла слишком большой для обработки (более 16 GB). Модель просто не сможет обучиться
# Считаем эксперимент неудачным

In [18]:
# Эксперимент 4. Пробуем сгенерировать новые признаки и оценить изменение метрики
# Поскольку согласно эксперименту 1 наилучшая стратегия агрегирования "по сумме", применим ее.
# Затем добавим дополнительные признаки: для каждого id найдем общее количество кредитных продуктов 
# (агрегация count по столбцу rn), а также сумму значений в столбцах is_zero_loans_5, is_zero_loans_530,
# is_zero_loans_3060, is_zero_loans_6090, is_zero_loans90

In [6]:
df_cut = pd.read_csv('df_cut.csv')
df_cut.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_status_5,enc_loans_credit_status_6,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3
0,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,1,1,1,1,1,0,1,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0,0,1,1,1,1,1,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,1,1,1,1,1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [7]:
df_exp3 = df_cut.groupby(['id'], as_index=False).agg('sum')
df_exp3.shape

(250000, 387)

In [10]:
# Добавляем дополнительный признак: общее количество кредитных продуктов для каждого id
df_exp3['rn_count'] = df_raw_0.groupby(['id'], as_index=False).agg(rn_count=pd.NamedAgg(column='rn', aggfunc='count'))['rn_count']
df_exp3.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_status_6,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,rn_count
0,0,9,10,10,10,10,6,9,9,1,...,0.0,1.0,0.0,2.0,7.0,0.0,10.0,0.0,0.0,10
1,1,12,10,12,12,11,10,12,11,1,...,0.0,3.0,0.0,3.0,8.0,0.0,14.0,0.0,0.0,14
2,2,3,2,2,2,3,1,3,2,2,...,0.0,0.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,3
3,3,15,15,15,15,15,8,14,14,5,...,0.0,1.0,0.0,4.0,9.0,1.0,15.0,0.0,0.0,15
4,4,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1


In [13]:
df_exp3['is_zero_sum'] = df_exp3.is_zero_loans5 + df_exp3.is_zero_loans530 + df_exp3.is_zero_loans3060 + df_exp3.is_zero_loans6090 + df_exp3.is_zero_loans90
df_exp3.head()

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,rn_count,is_zero_sum
0,0,9,10,10,10,10,6,9,9,1,...,1.0,0.0,2.0,7.0,0.0,10.0,0.0,0.0,10,49
1,1,12,10,12,12,11,10,12,11,1,...,3.0,0.0,3.0,8.0,0.0,14.0,0.0,0.0,14,57
2,2,3,2,2,2,3,1,3,2,2,...,0.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,3,12
3,3,15,15,15,15,15,8,14,14,5,...,1.0,0.0,4.0,9.0,1.0,15.0,0.0,0.0,15,75
4,4,1,1,1,1,1,1,1,1,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,5


In [14]:
# Теперь соединяем получившийся датасет с таргетом и проверяем метрики на разных моделях
df_prepared_exp3 = pd.merge(left=df_exp3, right=targets, on='id', how='inner').drop('id', axis=1)
X = df_prepared_exp3.drop('flag', axis=1)
y = df_prepared_exp3['flag']

In [23]:
df_prepared_exp3.to_csv('df_prepared_exp3.csv')

In [15]:
# Проверяем как влияют изменения на метрику модели логистической регрессии
random_seed = 1
log_model = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=random_seed)
score = cross_val_score(
    log_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.7479, roc_auc_std: 0.0070


In [16]:
# Проверяем как влияют изменения на метрику модели случайного леса
random_seed = 1
forest_model = RandomForestClassifier(class_weight='balanced', random_state=random_seed)
score = cross_val_score(
    forest_model,
    X,
    y,                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

roc_auc_mean: 0.7038, roc_auc_std: 0.0031


In [None]:
# Пробуем модель MLP
RANDOM_SEED=1
MLP_model = MLPClassifier(random_state=RANDOM_SEED, activation='logistic', max_iter=2000, hidden_layer_sizes=(400,), learning_rate_init=0.002)
score = cross_val_score(
    MLP_model,
    X,
    np.ravel(y),                         
    scoring='roc_auc',
    cv=4, 
    n_jobs=-1
)
print(f'roc_auc_mean: {score.mean():.4f}, roc_auc_std: {score.std():.4f}')

In [17]:
# генерирование дополнительных признаков дало небольшой прирост метрики roc auc на всех моделях
# Считаем эксперимент удачным

In [4]:
# На основе проведенных экспериментов, можно сделать вывод о том, что наиболее оптимальный препроцессинг данных 
# будет выглядеть таким образом:
# ohe-кодирование >> группировка с агрегацией по сумме >> генерирование 2х дополнительных признаков

In [None]:
# Теперь подготовим функцию для предобработке всего массива данных 

In [16]:
# путь до данных на компьютере
train_data_path = './train_data/'
processed_data_path = './processed_data/'

In [6]:
columns_to_encode = columns_to_read.copy()
columns_to_remove = binary_cols.copy()
columns_to_remove.append('id')
for col in columns_to_remove:
    columns_to_encode.remove(col)

In [8]:
print(columns_to_read)
print(binary_cols)
print(columns_to_encode)

['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm', 'pre_fterm', 'pre_till_pclose', 'pre_till_fclose', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90', 'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24', 'enc_loans_account_holder_type', 'enc_loans_credit_status',

In [8]:
def preprocessing(df):
    # Кодируем выбранные колонки
    ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    encoded_features = ohe.fit_transform(df[columns_to_encode])
    df_encoded_features = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out())
    # Конкатенируем и удаляем ненужные колонки
    df_encoded = pd.concat([df, df_encoded_features], axis=1).drop(columns_to_encode, axis=1)
    # Группируем с агрегацией по сумме
    df_processed = df_encoded.groupby(['id'], as_index=False).agg('sum')
    # Добавляем признак rn_count
    df_processed['rn_count'] = df.groupby(['id'], as_index=False).agg(
        rn_count=pd.NamedAgg(column='rn', aggfunc='count'))['rn_count']
    # Добавляем признак is_zero_sum
    df_processed['is_zero_sum'] = (
        df_processed.is_zero_loans5 + df_processed.is_zero_loans530 + df_processed.is_zero_loans3060 + \
        df_processed.is_zero_loans6090 + df_processed.is_zero_loans90
    )    
    return df_processed 

In [9]:
# проверим функцию на работоспособность
df = preprocessing(df_raw_0)
df.shape

(250000, 389)

In [10]:
df.head()
# все работает)))

Unnamed: 0,id,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,...,enc_loans_credit_type_1,enc_loans_credit_type_2,enc_loans_credit_type_3,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,rn_count,is_zero_sum
0,0,9,10,10,10,10,6,9,9,1,...,1.0,0.0,2.0,7.0,0.0,10.0,0.0,0.0,10,49
1,1,12,10,12,12,11,10,12,11,1,...,3.0,0.0,3.0,8.0,0.0,14.0,0.0,0.0,14,57
2,2,3,2,2,2,3,1,3,2,2,...,0.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,3,12
3,3,15,15,15,15,15,8,14,14,5,...,1.0,0.0,4.0,9.0,1.0,15.0,0.0,0.0,15,75
4,4,1,1,1,1,1,1,1,1,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,5


In [13]:
def prepare_transactions_dataset(path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                 save_to_path=None, verbose: bool=False):
    """
    возвращает готовый pd.DataFrame с признаками, на которых можно учить модель для целевой задачи
    path_to_dataset: str
        путь до датасета с партициями
    num_parts_to_preprocess_at_once: int
        количество партиций, которые будут одновременно держаться и обрабатываться в памяти
    num_parts_total: int
        общее количество партиций, которые нужно обработать
    save_to_path: str
        путь до папки, в которой будет сохранён каждый обработанный блок в .parquet-формате; если None, то не будет сохранён
    verbose: bool
        логирует каждую обрабатываемую часть данных
    """
    preprocessed_frames = []

    for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                   desc="Transforming transactions data"):
        transactions_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once,
                                                             verbose=verbose)


   #здесь должен быть препроцессинг данных
        transactions_frame = preprocessing(transactions_frame)

   #записываем подготовленные данные в файл
        if save_to_path:
            block_as_str = str(step)
            if len(block_as_str) == 1:
                block_as_str = '00' + block_as_str
            else:
                block_as_str = '0' + block_as_str
            transactions_frame.to_parquet(os.path.join(save_to_path, f'processed_chunk_{block_as_str}.parquet'))

        preprocessed_frames.append(transactions_frame)
        df = pd.concat(preprocessed_frames)
        # Заполняем пропуски значением 0
        df = df.fillna(0)
    return df

In [17]:
df_full = prepare_transactions_dataset(path_to_dataset=train_data_path, num_parts_to_preprocess_at_once=1, num_parts_total=12,
                                 save_to_path=processed_data_path, verbose=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),


Transforming transactions data:   0%|          | 0/12 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_0.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_1.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_10.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_11.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_2.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_3.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_4.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_5.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_6.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_7.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_8.pq
['./train_data/train_data_0.pq', './train_data/train_data_1.pq', './train_data/train_data_10.pq', './train_data/train_data_11.pq', './train_data/train_data_2.pq', './train_data/train_data_3.pq', './train_data/train_data_4.pq', './train_data/train_data_5.pq', './train_data/train_data_6.pq', './train_data/train_data_7.pq', './train_data/train_data_8.pq', './train_data/train_data_9.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path ./train_data/train_data_9.pq


In [18]:
df_full.shape

(3000000, 420)

In [19]:
# Теперь соединяем получившийся датасет с таргетом и проверяем метрики на разных моделях
df_prepared = pd.merge(left=df_full, right=targets, on='id', how='inner').drop('id', axis=1)

In [20]:
df_prepared.shape

(3000000, 420)

In [21]:
df_prepared.head()

Unnamed: 0,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,fclose_flag,...,pre_loans530_9,pre_loans_total_overdue_1,pre_loans_max_overdue_sum_1,pre_loans3060_1,pre_loans3060_3,pre_loans3060_4,pre_loans6090_1,pre_loans5_10,pre_loans530_17,flag
0,9,10,10,10,10,6,9,9,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,12,10,12,12,11,10,12,11,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,2,2,2,3,1,3,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,15,15,15,15,15,8,14,14,5,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
# Сохраним датафрейм в файл
df_prepared.to_parquet('./processed_data/df_prepared.parquet')

In [4]:
df_prepared = pd.read_parquet('./processed_data/df_prepared.parquet', engine='fastparquet')

## Modeling

In [6]:
# Разделим датафрейм на тренировочную и тестовую выборку
df_train, df_test = train_test_split(df_prepared, test_size=0.2, stratify=df_prepared['flag'])

In [7]:
X_train, y_train = df_train.drop(['flag'], axis=1), df_train['flag']
X_test, y_test = df_test.drop(['flag'], axis=1), df_test['flag']

In [3]:
# Подбираем гиперпараметры для LogisticRegression. Применим GridSearchCV
# Инициализируем сетку параметров для перебора
param_grid = {    
    'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]    
}
# Инициализируем базовую модель
random_seed = 1
log_model = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=random_seed)
# Модель для перебора параметров базовой модели
grid_search_log = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,    
    n_jobs=-1
)
# Выполняем перебор параметров
grid_search_log.fit(X_train, y_train)
# Выводим лучшие параметры и метрику
best_params = grid_search_log.best_params_
best_score = grid_search_log.best_score_
print(best_params)
print(best_score)

MemoryError: Unable to allocate 1.82 GiB for an array with shape (407, 600000) and data type float64

In [2]:
# Памяти не хватает. Проведем подбор гиперпараметров на 2 батчах от общей выборки т.е. на первых 500000 записях
df = pd.read_parquet('./processed_data/df_prepared.parquet', engine='fastparquet')
df_gridsearch = df.iloc[0:500000, :]
X_gridsearch = df_gridsearch.drop('flag', axis=1)
y_gridsearch = df_gridsearch['flag']

In [3]:
X_gridsearch.to_parquet('./processed_data/X_gridsearch.parquet')
y_gridsearch.to_csv('./processed_data/y_gridsearch.csv', index=False)

In [2]:
X_gridsearch = pd.read_parquet('./processed_data/X_gridsearch.parquet', engine='fastparquet')
y_gridsearch = pd.read_csv('./processed_data/y_gridsearch.csv')

In [None]:
X_gridsearch = pd.read_parquet('X_gridsearch.parquet', engine='fastparquet')
y_gridsearch = pd.read_csv('y_gridsearch.csv')

In [3]:
# Подбираем гиперпараметры для LogisticRegression. Применим GridSearchCV
# Инициализируем сетку параметров для перебора
param_grid = {    
    'C': [0.05, 0.1, 0.5, 1.0, 5.0, 10.0]    
}
# Инициализируем базовую модель
random_seed = 1
log_model = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=random_seed)
# Модель для перебора параметров базовой модели
grid_search_log = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,    
    n_jobs=-1
)
# Выполняем перебор параметров
grid_search_log.fit(X_gridsearch, np.ravel(y_gridsearch))
# Выводим лучшие параметры и метрику
best_params = grid_search_log.best_params_
best_score = grid_search_log.best_score_
print(best_params)
print(best_score)

  y = column_or_1d(y, warn=True)


{'C': 0.1}
0.7543424759780092


In [3]:
# Подбираем гиперпараметры для RandomForestClassifier. Применим GridSearchCV
# Инициализируем сетку параметров для перебора
params = {    
    'n_estimators': list(range(10, 311, 50)),
    'max_depth': list(range(4, 21, 4))
}
# Инициализируем базовую модель
random_seed = 1
forest_model = RandomForestClassifier(class_weight='balanced', random_state=random_seed)
# Модель для перебора параметров базовой модели
grid_search_forest = GridSearchCV(
    estimator=forest_model,
    param_grid=params,x
    scoring='roc_auc',
    cv=4,    
    n_jobs=-1
)
# Выполняем перебор параметров
grid_search_forest.fit(X_gridsearch, np.ravel(y_gridsearch))
# Выводим лучшие параметры и метрику
best_params = grid_search_forest.best_params_
best_score = grid_search_forest.best_score_
print(best_params)
print(best_score)

  return fit_method(estimator, *args, **kwargs)


{'max_depth': 10, 'n_estimators': 141}
0.7390067783226267


In [None]:
# Подбираем дополнительные гиперпараметры для MLPClassifier. Также применим GridSearchCV
# Инициализируем сетку параметров для перебора
params = {    
    'activation': ['logistic', 'relu'],    
    'hidden_layer_sizes': [(450,), (500,), (450, 200)]    
}
# Инициализируем базовую модель
RANDOM_SEED=1
MLP_model = MLPClassifier(random_state=RANDOM_SEED, max_iter=3000, learning_rate_init=0.002)
# Модель для перебора параметров базовой модели
grid_search_MLP = GridSearchCV(
    estimator=MLP_model,
    param_grid=params,
    scoring='roc_auc',
    cv=4,    
    n_jobs=-1
)
# Выполняем перебор параметров
grid_search_MLP.fit(X_gridsearch, np.ravel(y_gridsearch))
# Выводим лучшие параметры и метрику
best_params = grid_search_MLP.best_params_
best_score = grid_search_MLP.best_score_
print(best_params)
print(best_score)

In [6]:
# Подбор параметров в ячейке выше проводился в Yandex Datasphere
# Результат следующий:
# {'activation': 'relu', 'hidden_layer_sizes': (450,)}
# 0.6768966220504706

In [1]:
# Наилучшие показатели показала модель логистической регрессии с параметром С 0,05
# Теперь проверим метрику данной модели по кросс-валидации на тренировочной выборке, а затем посмотрим метрику roc_auc на 
# тестовой выборке

In [2]:
df_prepared = pd.read_parquet('./processed_data/df_prepared.parquet', engine='fastparquet')

In [4]:
# Прежде, чем проводить дальнейшее моделирование, попробуем уменьшить объем данных, который занимает датасет,
# чтобы быстрее проводились расчеты
df_prepared.head()

Unnamed: 0,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,fclose_flag,...,pre_loans530_9,pre_loans_total_overdue_1,pre_loans_max_overdue_sum_1,pre_loans3060_1,pre_loans3060_3,pre_loans3060_4,pre_loans6090_1,pre_loans5_10,pre_loans530_17,flag
0,9,10,10,10,10,6,9,9,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,12,10,12,12,11,10,12,11,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,2,2,2,3,1,3,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,15,15,15,15,15,8,14,14,5,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
df_prepared.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Columns: 420 entries, is_zero_loans5 to flag
dtypes: float64(407), int64(13)
memory usage: 9.4 GB


In [7]:
max(list(dict(df_prepared.max()).values()))

290.0

In [8]:
# Самое большое число в датасете - 290 и в int8 оно не влйдет. Применим int16
df_prepared_int = df_prepared.astype(dtype='int16')

In [9]:
max(list(dict(df_prepared_int.max()).values()))

290

In [8]:
df_prepared_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Columns: 420 entries, is_zero_loans5 to flag
dtypes: int16(420)
memory usage: 2.3 GB


In [10]:
df_prepared_int.head()

Unnamed: 0,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,fclose_flag,...,pre_loans530_9,pre_loans_total_overdue_1,pre_loans_max_overdue_sum_1,pre_loans3060_1,pre_loans3060_3,pre_loans3060_4,pre_loans6090_1,pre_loans5_10,pre_loans530_17,flag
0,9,10,10,10,10,6,9,9,1,2,...,0,0,0,0,0,0,0,0,0,0
1,12,10,12,12,11,10,12,11,1,2,...,0,0,0,0,0,0,0,0,0,0
2,3,2,2,2,3,1,3,2,2,2,...,0,0,0,0,0,0,0,0,0,0
3,15,15,15,15,15,8,14,14,5,6,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Разделим выборку на тренировочную и тестовую
df_train, df_test = train_test_split(df_prepared_int, test_size=0.2, stratify=df_prepared['flag'])

In [12]:
# Выделим целевую переменную
X_train, y_train = df_train.drop(['flag'], axis=1), df_train['flag']
X_test, y_test = df_test.drop(['flag'], axis=1), df_test['flag']

In [13]:
# Сохраним датафреймы в файлы
X_train.to_parquet('./processed_data/X_train.parquet', index=False)
X_test.to_parquet('./processed_data/X_test.parquet', index=False)
y_train.to_csv('./processed_data/y_train.csv', index=False)
y_test.to_csv('./processed_data/y_test.csv', index=False)

In [3]:
X_train = pd.read_parquet('./processed_data/X_train.parquet', engine='fastparquet')
y_train = pd.read_csv('./processed_data/y_train.csv')
X_test = pd.read_parquet('X_test.parquet', engine='fastparquet')
y_test = pd.read_csv('y_test.csv')

In [14]:
X_train.head()

Unnamed: 0,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,fclose_flag,...,pre_loans530_8,pre_loans530_9,pre_loans_total_overdue_1,pre_loans_max_overdue_sum_1,pre_loans3060_1,pre_loans3060_3,pre_loans3060_4,pre_loans6090_1,pre_loans5_10,pre_loans530_17
2735600,11,11,11,11,11,8,10,10,4,3,...,0,0,0,0,0,0,0,0,0,0
1486179,5,2,4,3,6,6,7,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2854355,2,1,2,2,2,1,2,1,1,2,...,0,0,0,0,0,0,0,0,0,0
381029,19,18,19,19,19,19,19,18,3,3,...,0,0,0,0,0,0,0,0,0,0
2530088,11,11,11,11,11,7,9,9,5,3,...,0,0,0,0,0,0,0,0,0,0


In [106]:
#!c1.32
# Проводим кросс-валидацию на модели логистической регрессии (вычисления проводились в Yandex Datasphere)
random_seed = 1
log_model = LogisticRegression(class_weight='balanced', max_iter=3000, C=0.1, random_state=random_seed)
metric = cross_val_score(log_model, X_train, y_train, cv=4, n_jobs=-1)
print(metric.mean())
print(metric.std())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.6566295833333333
0.001034540541469723


In [111]:
#!c1.8
# Проводим кросс-валидацию на модели случайного леса (Вычисления проводились в yandex datasphere)
random_seed = 1
forest_model = RandomForestClassifier(
    class_weight='balanced', 
    random_state=random_seed, 
    max_depth=10, 
    n_estimators=250    
)
metric = cross_val_score(forest_model, X_train, np.ravel(y_train), cv=4, n_jobs=-1)
print(metric.mean())
print(metric.std())

0.7039554166666667
0.0014385208331824735


In [113]:
#!c1.8
# Проводим кросс-валидацию на модели многослойного персептрона (Вычисления проводились в yandex datasphere)
random_seed = 1
MLP_model = MLPClassifier(
    random_state=random_seed,
    max_iter=3000,
    learning_rate_init=0.002,
    activation='relu',
    hidden_layer_sizes=(450,)
)
metric = cross_val_score(MLP_model, X_train, np.ravel(y_train), cv=4, n_jobs=-1)
print(metric.mean())
print(metric.std())

0.96449125
1.8309643785599802e-05


In [120]:
#!c1.8
# Наилучший результат показала модель многослойного персептрона
# Обучим модель на тренировочной выборке и проверим метрику roc_auc (Вычисления проводились в yandex datasphere)
MLP_model = MLPClassifier(
MLP_model = MLPClassifier(
    random_state=random_seed,
    max_iter=3000,
    learning_rate_init=0.002,
    activation='relu',
    hidden_layer_sizes=(450,)
)
MLP_model.fit(X_train, np.ravel(y_train))
roc_auc = roc_auc_score(y_test, MLP_model.predict_proba(X_test)[:, 1])
print(roc_auc)

0.7562898519076008


In [1]:
# Метрика по roc_auc показывает удовлетворительные результаты. Однако имеет место переобучение модели, поэтому 
# попробуем применить другие модели. Будем использовать модель lightgbm

In [207]:
#!c1.8
# Создаем объект класса lightgbm.LGBMClassifier
lgbm_model = lightgbm.LGBMClassifier(
              seed=1,
              n_jobs=-1,
              is_unbalance='true',
              objective='binary',
              n_estimators=3000,
              learning_rate=0.05,
              max_depth=10,
              metric='auc',
              boosting_type='gbdt',
              num_leaves=62,
              verbose=-1
)

In [234]:
#!c1.8
# Тренируем модель
lgbm_model.fit(
    X_train, np.ravel(y_train), 
    eval_set=[(X_test, np.ravel(y_test))], 
    callbacks=[lightgbm.early_stopping(stopping_rounds=50), lightgbm.log_evaluation()]
)

[1]	valid_0's auc: 0.697008
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.709244
[3]	valid_0's auc: 0.716019
[4]	valid_0's auc: 0.720355
[5]	valid_0's auc: 0.722618
[6]	valid_0's auc: 0.724644
[7]	valid_0's auc: 0.726139
[8]	valid_0's auc: 0.727789
[9]	valid_0's auc: 0.728802
[10]	valid_0's auc: 0.729821
[11]	valid_0's auc: 0.730725
[12]	valid_0's auc: 0.731648
[13]	valid_0's auc: 0.732515
[14]	valid_0's auc: 0.732924
[15]	valid_0's auc: 0.733389
[16]	valid_0's auc: 0.733977
[17]	valid_0's auc: 0.734541
[18]	valid_0's auc: 0.734967
[19]	valid_0's auc: 0.735456
[20]	valid_0's auc: 0.735978
[21]	valid_0's auc: 0.736397
[22]	valid_0's auc: 0.736732
[23]	valid_0's auc: 0.737011
[24]	valid_0's auc: 0.737391
[25]	valid_0's auc: 0.737625
[26]	valid_0's auc: 0.738106
[27]	valid_0's auc: 0.738427
[28]	valid_0's auc: 0.738675
[29]	valid_0's auc: 0.739014
[30]	valid_0's auc: 0.739239
[31]	valid_0's auc: 0.739643
[32]	valid_0's auc: 0.739964
[33]	valid_0's auc: 

In [233]:
#!c1.8
# Выводим получившиеся метрики
y_train_pred = lgbm_model.predict_proba(X_train)[:, 1]
y_test_pred = lgbm_model.predict_proba(X_test)[:, 1]

print("AUC Train: {:.4f}\nAUC Test: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_test_pred)))

AUC Train: 0.7910
AUC Test: 0.7583


In [200]:
#!c1.8
# Попробуем улучшить метрику. Удалим сгенерированные ранее признаки и проверим, как это повлияет на метрику
X_train_short = X_train.drop(['rn_count', 'is_zero_sum'], axis=1)
X_test_short = X_test.drop(['rn_count', 'is_zero_sum'], axis=1)

In [208]:
#!c1.8
# Обучаем модель на 'укороченных' данных
lgbm_model.fit(
    X_train_short, np.ravel(y_train), 
    eval_set=[(X_test_short, np.ravel(y_test))], 
    callbacks=[lightgbm.early_stopping(stopping_rounds=50), lightgbm.log_evaluation()]
)

[1]	valid_0's auc: 0.697025
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.709134
[3]	valid_0's auc: 0.715668
[4]	valid_0's auc: 0.719302
[5]	valid_0's auc: 0.721814
[6]	valid_0's auc: 0.724133
[7]	valid_0's auc: 0.725441
[8]	valid_0's auc: 0.727385
[9]	valid_0's auc: 0.728537
[10]	valid_0's auc: 0.729708
[11]	valid_0's auc: 0.73081
[12]	valid_0's auc: 0.731468
[13]	valid_0's auc: 0.732251
[14]	valid_0's auc: 0.73268
[15]	valid_0's auc: 0.733333
[16]	valid_0's auc: 0.733955
[17]	valid_0's auc: 0.734567
[18]	valid_0's auc: 0.735024
[19]	valid_0's auc: 0.735495
[20]	valid_0's auc: 0.735812
[21]	valid_0's auc: 0.736165
[22]	valid_0's auc: 0.736652
[23]	valid_0's auc: 0.736973
[24]	valid_0's auc: 0.737394
[25]	valid_0's auc: 0.737874
[26]	valid_0's auc: 0.738155
[27]	valid_0's auc: 0.738552
[28]	valid_0's auc: 0.738939
[29]	valid_0's auc: 0.739118
[30]	valid_0's auc: 0.739455
[31]	valid_0's auc: 0.739805
[32]	valid_0's auc: 0.740086
[33]	valid_0's auc: 0.

In [209]:
#!c1.8
# Выводим метрику на печать
y_train_pred = lgbm_model.predict_proba(X_train_short)[:, 1]
y_test_pred = lgbm_model.predict_proba(X_test_short)[:, 1]

print("AUC Train: {:.4f}\nAUC Test: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_test_pred)))

AUC Train: 0.7961
AUC Test: 0.7589


In [2]:
# Метрика улучшилась. Кроме того, overfitting гораздо ниже, чем на модели многослойного персептрона. 
# Еще одним преимуществом данной модели является скорость обучения, которая значительно выше, чем скорость обучения
# многослойного персептрона

# Results

In [3]:
# 1. Наиболее опимальным препроцессингом данных является кодирование категориальных переменных с последующей агрегацией 
# по сумме.
# 2. Наиболее результативной моделью является lightgbm.LGBMClassifier.
# 3. Наилучшая метрика, которую удалось достичь, составляет 0.7589