# Импорт

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Training data: four feature tables (parquet) and the target (CSV),
# all read from the relative 'input_data' directory.
train_card_spending_df = pd.read_parquet('input_data/train_card_spending_df.parquet')
train_main_df = pd.read_parquet('input_data/train_main_df.parquet')
train_mcc_operations_df = pd.read_parquet('input_data/train_mcc_operations_df.parquet')
train_mcc_preferences_df = pd.read_parquet('input_data/train_mcc_preferences_df.parquet')
train_target = pd.read_csv('input_data/train_target.csv')

In [3]:
# The row order of ids appears to match between train_target, train_main_df and
# the other tables, so the positional index is copied into a regular column and
# every merge is done on both ('id', 'index') to avoid fabricating rows.
train_target['index'] = train_target.index
train_main_df['index'] = train_main_df.index
train_card_spending_df['index'] = train_card_spending_df.index
train_mcc_operations_df['index'] = train_mcc_operations_df.index
train_mcc_preferences_df['index'] = train_mcc_preferences_df.index

train_full_info = pd.merge(train_target, train_main_df, on=['id', 'index'])

# The merge is an inner join: if the assumed row alignment does not hold, the
# mismatching rows disappear silently. Fail loudly instead of losing data.
assert len(train_full_info) == len(train_target), (
    f"merge on ['id', 'index'] lost rows: {len(train_target)} -> {len(train_full_info)}"
)

In [4]:
# Attach the target to each auxiliary table, matching rows on ('id', 'index').
merge_keys = ['id', 'index']
train_card_spending_df = train_target.merge(train_card_spending_df, on=merge_keys)
train_mcc_operations_df = train_target.merge(train_mcc_operations_df, on=merge_keys)
train_mcc_preferences_df = train_target.merge(train_mcc_preferences_df, on=merge_keys)

# Функции

In [5]:
def all_nans_fullfilling(train_full_info):
    """Impute or drop the NaN-bearing columns of the main client table.

    Steps, in order:
      1. Drop the application columns that are not imputed (vehicle, real
         estate, family counters) and 'cnt_prolong_max_5y'.
      2. Fill NaN with the per-column median in every column whose name starts
         with 'zp_', 'max_', 'min_', 'avg_', 'sum_', 'dep_', 'income_' or
         'cnt_' (except 'cnt_prolong_max').
      3. Fill NaN in 'cnt_prolong_max' with the median of that column among rows
         that fall into the same 'max_term' interval.
      4. Overwrite 'app_income_app' with
         mean(income_verified, income_verified_primary_job) + income_unverified.

    Parameters:
        train_full_info: DataFrame containing the columns named above.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: if one of the columns to drop or to read is missing.
    """
    # drop() returns a new frame, so all assignments below leave the caller's
    # DataFrame untouched.
    dropped_columns = [
        'vehicle_counrty_type_nm', 'used_car_flg', 'app_vehicle_ind',
        'cnt_prolong_max_5y',
        'app_real_estate_ind',
        'app_children_cnt', 'app_dependent_cnt', 'app_family_cnt',
    ]
    result = train_full_info.drop(columns=dropped_columns)

    # One pass of median imputation over all prefix-selected columns. Medians
    # are per column, so this equals filling prefix group by prefix group
    # ('avg_dep_avg_balance_fact_' is already covered by 'avg_').
    median_prefixes = ('zp_', 'max_', 'min_', 'avg_', 'sum_', 'dep_', 'income_', 'cnt_')
    median_mask = (
        result.columns.str.startswith(median_prefixes)
        & ~result.columns.isin(['cnt_prolong_max'])  # handled separately below
    )
    median_cols = result.columns[median_mask]
    if len(median_cols) > 0:
        result[median_cols] = result[median_cols].fillna(result[median_cols].median())

    # 'cnt_prolong_max' is imputed from 'max_term': rows are bucketed into
    # left-closed intervals of max_term and a missing value receives the median
    # of its bucket. labels=False yields plain integer bucket numbers, which
    # avoids a temporary categorical column and the categorical-groupby warning.
    term_bins = [0, 7, 12, 31, 32, 33, 41, 51, 58, 61, 63, 75, 80, float('inf')]
    term_bucket = pd.cut(result['max_term'], bins=term_bins, labels=False, right=False)
    bucket_medians = result['cnt_prolong_max'].groupby(term_bucket).median()
    # Rows whose max_term is outside every interval map to NaN and stay unfilled.
    result['cnt_prolong_max'] = result['cnt_prolong_max'].fillna(term_bucket.map(bucket_medians))

    # app_income_app = mean(income_verified, income_verified_primary_job)
    #                  + income_unverified, computed column-wise in float64
    # instead of a Python call per row.
    verified_mean = (
        result['income_verified'].astype('float64')
        + result['income_verified_primary_job'].astype('float64')
    ) / 2
    result['app_income_app'] = verified_mean + result['income_unverified'].astype('float64')

    return result

In [6]:
from collections import defaultdict

def filter_by_variance(df, threshold=0.01, exclude_columns=None):
    """
    Drop numeric columns whose variance is below a threshold.

    Parameters:
    - df: source DataFrame
    - threshold: variance threshold (default 0.01); a column is dropped when
      its variance is strictly lower
    - exclude_columns: columns that must never be dropped (e.g. the target)

    Returns:
    - DataFrame without the low-variance columns; non-numeric columns are kept
    """
    protected = [] if exclude_columns is None else exclude_columns

    # Candidates: numeric columns that are not protected.
    candidates = df.select_dtypes(include=['number']).columns.difference(protected)
    column_variance = df[candidates].var()

    # A NaN variance never compares below the threshold, so such columns stay.
    doomed = [name for name, value in column_variance.items() if value < threshold]
    return df.drop(columns=doomed)

def filter_by_low_correlation(df, target_col=('id', 'index', 'target'), threshold=0.01):
    """
    Keep only the numeric columns correlated with at least one reference column.

    A numeric column is kept when the absolute Pearson correlation with any of
    the columns named in target_col is strictly above the threshold. The
    reference columns themselves are kept (their self-correlation is 1) unless
    they are constant. Non-numeric columns are not part of the result.

    Parameters:
    - df: source DataFrame
    - target_col: one column name (str) or a collection of column names used as
      correlation references (default: 'id', 'index', 'target')
    - threshold: absolute correlation threshold (default 0.01)

    Returns:
    - DataFrame restricted to the selected numeric columns, in original order

    Raises:
    - KeyError: if a reference column is missing or is not numeric
    """
    # Accept a bare string as well as a list/tuple of names.
    reference_cols = [target_col] if isinstance(target_col, str) else list(target_col)

    numeric_df = df.select_dtypes(include=['number'])
    missing = [col for col in reference_cols if col not in numeric_df.columns]
    if missing:
        raise KeyError(f"reference columns are missing or not numeric: {missing}")

    # Only the correlations against the reference columns are needed, so the
    # full column-by-column correlation matrix is never built.
    keep = pd.Series(False, index=numeric_df.columns)
    for ref in reference_cols:
        abs_corr = numeric_df.corrwith(numeric_df[ref]).abs()
        # A NaN correlation (constant column) compares False and is not kept.
        keep |= (abs_corr > threshold).reindex(numeric_df.columns, fill_value=False)

    significant_cols = numeric_df.columns[keep.to_numpy()]
    return df[significant_cols]

def remove_temporal_duplicates(df, target_col=None):
    """
    Drop columns that repeat the same feature over different time windows.

    Columns are grouped by their name with the trailing temporal suffix
    ('_now', '_<N>m', '_<N>' with one or two digits, '_7d') removed. Within a
    group of two or more temporal variants:
    - if any of '_now', '_1m', '_12m' is present, exactly those variants are
      kept and every other variant is dropped;
    - otherwise only the variant with the highest priority is kept.

    Columns without a temporal suffix are never dropped, even when a suffixed
    column shares their base name (e.g. 'x' next to 'x_1m').

    Parameters:
    - df: source DataFrame
    - target_col: column name, or collection of names, that must be kept

    Returns:
    - new DataFrame without the redundant temporal columns
    """
    import re
    from collections import defaultdict

    temporal_pattern = re.compile(r'(_now|_\d{1,2}m|_\d{1,2}|_7d)$')
    # Suffixes that are all kept whenever at least one of them is present.
    keep_all_suffixes = {'_now', '_1m', '_12m'}
    # Priority order used when none of the suffixes above is present.
    standard_priority = ['_now', '_1m', '_1', '_12m', '_12', '_2m', '_2',
                         '_3m', '_3', '_6m', '_6', '_9m', '_9', '_7d']
    priority_rank = {suffix: rank for rank, suffix in enumerate(standard_priority)}
    lowest_rank = len(standard_priority)

    if target_col is None:
        protected = set()
    elif isinstance(target_col, str):
        protected = {target_col}
    else:
        protected = set(target_col)

    # Group only the columns that carry a temporal suffix. Suffix-less columns
    # are deliberately left out so they can never be selected for removal.
    col_groups = defaultdict(list)
    for col in df.columns:
        if col in protected or not isinstance(col, str):
            continue
        match = temporal_pattern.search(col)
        if match:
            suffix = match.group()
            col_groups[col[:-len(suffix)]].append((suffix, col))

    cols_to_remove = []
    for variants in col_groups.values():
        if len(variants) < 2:
            continue  # a single temporal variant has nothing to deduplicate

        if any(suffix in keep_all_suffixes for suffix, _ in variants):
            cols_to_remove.extend(
                col for suffix, col in variants if suffix not in keep_all_suffixes
            )
        else:
            # sorted() is stable: ties keep the original column order.
            ranked = sorted(variants, key=lambda item: priority_rank.get(item[0], lowest_rank))
            cols_to_remove.extend(col for _, col in ranked[1:])

    return df.drop(columns=cols_to_remove)


def preprocess_data(df, mode='both', nan_threshold=0.7):
    """
    Drop columns whose share of missing values (NaN) exceeds a threshold.

    Parameters:
        df: source DataFrame.
        mode: which columns are checked:
            - 'both' (default): every column;
            - 'numeric': numeric columns of any width (int8..int64,
              float16..float64, ...);
            - 'categorical': object, category and bool columns.
            Any other value is treated like 'both'.
        nan_threshold: share of NaN above which a column is dropped
            (0.7 = more than 70% of the rows).

    Returns:
        New DataFrame without the dropped columns.
    """
    if mode == 'numeric':
        # 'number' matches every numeric dtype; ['int', 'float'] would only
        # match the platform default widths and skip e.g. float32 columns.
        cols_to_check = df.select_dtypes(include=['number']).columns
    elif mode == 'categorical':
        cols_to_check = df.select_dtypes(include=['object', 'category', 'bool']).columns
    else:  # 'both'
        cols_to_check = df.columns

    # Absolute number of NaN a column may contain before it is dropped.
    max_nan_count = len(df) * nan_threshold

    cols_to_drop = [col for col in cols_to_check if df[col].isnull().sum() > max_nan_count]
    return df.drop(columns=cols_to_drop)


def remove_correlated_features(df, target_col='target', threshold=0.5):
    """
    Drop numeric features that are strongly correlated with an earlier feature.

    For every pair of numeric features whose absolute Pearson correlation is
    above the threshold, the one that appears later in the column order is
    dropped. The target column is excluded from the comparison and appended as
    the last column of the result.

    Note: non-numeric columns other than the target are not part of the result.

    Parameters:
        df: source DataFrame.
        target_col: name of the target column (never dropped; may be non-numeric).
        threshold: absolute correlation above which a feature is dropped
            (default 0.5).

    Returns:
        DataFrame with the remaining numeric features followed by the target.
    """
    target = df[target_col]
    # errors='ignore': a non-numeric target is already absent from the numeric
    # selection and must not raise here.
    features = df.select_dtypes(include=['number']).drop(columns=[target_col], errors='ignore')

    corr_matrix = features.corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is compared solely with the columns that precede it.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()

    filtered_features = features.drop(columns=to_drop)
    return pd.concat([filtered_features, target], axis=1)

    
# Пример использования:
# фильтруем по дисперсии (сохраняя целевую переменную)
# df_filtered = filter_by_variance(train_full_info, threshold=0.01, exclude_columns=['target'])

# фильтруем по корреляции
# df_filtered = filter_by_low_correlation(df_filtered, target_col='target', threshold=0.4)

# удаляем временные дубликаты
# df_final = remove_temporal_duplicates(df_filtered, target_col='target')

# удаляем колонки с большим количеством NaN
# preprocess_data(df)

# удаляем то, что сильно коррелирует между собой
# remove_correlated_features(df)

# Основной датасет

In [7]:
# Run the NaN handling on a copy so that train_full_info itself stays as merged.
df = all_nans_fullfilling(train_full_info.copy())

df

  train_full_info['max_term_category'] = pd.cut(
  median_values = train_full_info.groupby('max_term_category')['cnt_prolong_max'].median()


Unnamed: 0,target,id,index,app_income_app,avg_dep_avg_balance_12month_amt,avg_dep_avg_balance_12month_amt_term,avg_dep_avg_balance_12month_amt_term_savings,avg_dep_avg_balance_1month_amt,avg_dep_avg_balance_1month_amt_term,avg_dep_avg_balance_1month_amt_term_savings,...,savings_sum_oms_debet_3m,savings_sum_oms_debet_6m,savings_sum_oms_debet_9m,savings_sum_oms_debet_12m,savings_service_model_cd,savings_pension_flg,savings_deposit_flg,savings_safe_acc_flg,savings_broker_flg,savings_oms_flg
0,0.000000,97678374,0,71867.645241,121030.757812,252079.132812,4027.373535,8.947612e+04,207176.398438,9910.662109,...,34.613216,0.000000,4.310414,75.214180,Массовый,0,0,1,0,0
1,0.000000,62472650,1,71867.645241,121030.757812,252079.132812,132074.539062,8.947612e+04,207176.398438,91689.093750,...,6.237672,0.000000,0.000000,0.000000,Массовый,0,0,1,0,0
2,219932.906250,94308112,2,105309.652023,315208.781250,252079.132812,274816.375000,2.356619e+05,207176.398438,225698.500000,...,0.000000,53.134129,90.025238,0.000000,Массовый,0,0,1,0,0
3,631.770020,68994873,3,339.378130,43187.953125,252079.132812,5277.233887,2.844051e+04,207176.398438,0.000000,...,0.000000,22.276114,82.070015,117.386795,Массовый,0,0,1,0,0
4,0.000000,78127603,4,71867.645241,121030.757812,252079.132812,0.000000,8.947612e+04,207176.398438,0.000000,...,0.000000,0.000000,0.000000,33.072178,Массовый,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85584,0.000000,6163462,85584,50582.660400,14605.051758,252079.132812,6170.456055,6.814772e+03,207176.398438,0.000000,...,36.119827,0.000000,73.875885,0.000000,Массовый,0,0,1,0,0
85585,841071.000000,22320709,85585,413239.799133,510666.125000,252079.132812,397536.968750,1.012610e+06,207176.398438,850407.312500,...,0.000000,0.000000,0.000000,2.739857,Привилегия,0,0,1,0,0
85586,27.990000,23863045,85586,76633.474609,4456.615723,252079.132812,0.000000,0.000000e+00,207176.398438,8278.396484,...,0.000000,0.000000,50.728161,0.000000,Массовый,0,0,1,0,0
85587,0.000000,33408678,85587,71867.645241,3737.191895,252079.132812,433.325165,8.750564e+04,207176.398438,79891.273438,...,41.665630,0.000000,0.000000,99.994492,Массовый,0,0,1,0,0


In [8]:
# Drop numeric columns of the main table whose variance is below the default
# threshold (0.01). No columns are excluded, so 'target', 'id' and 'index' are
# candidates too.
df = filter_by_variance(df)

In [9]:
#df = remove_temporal_duplicates(df)

In [10]:
#df = remove_correlated_features(df)

In [11]:
#df = filter_by_low_correlation(df, threshold=0.01)

In [12]:
# Store the processed main table back under its own name and display it.
train_full_info = df.copy()
train_full_info

Unnamed: 0,target,id,index,app_income_app,avg_dep_avg_balance_12month_amt,avg_dep_avg_balance_12month_amt_term,avg_dep_avg_balance_12month_amt_term_savings,avg_dep_avg_balance_1month_amt,avg_dep_avg_balance_1month_amt_term,avg_dep_avg_balance_1month_amt_term_savings,...,savings_sum_oms_credit_12m,savings_sum_oms_debet_1m,savings_sum_oms_debet_3m,savings_sum_oms_debet_6m,savings_sum_oms_debet_9m,savings_sum_oms_debet_12m,savings_service_model_cd,savings_pension_flg,savings_deposit_flg,savings_broker_flg
0,0.000000,97678374,0,71867.645241,121030.757812,252079.132812,4027.373535,8.947612e+04,207176.398438,9910.662109,...,0.000000,0.000000,34.613216,0.000000,4.310414,75.214180,Массовый,0,0,0
1,0.000000,62472650,1,71867.645241,121030.757812,252079.132812,132074.539062,8.947612e+04,207176.398438,91689.093750,...,0.000000,10.644595,6.237672,0.000000,0.000000,0.000000,Массовый,0,0,0
2,219932.906250,94308112,2,105309.652023,315208.781250,252079.132812,274816.375000,2.356619e+05,207176.398438,225698.500000,...,0.000000,0.000000,0.000000,53.134129,90.025238,0.000000,Массовый,0,0,0
3,631.770020,68994873,3,339.378130,43187.953125,252079.132812,5277.233887,2.844051e+04,207176.398438,0.000000,...,0.000000,0.933353,0.000000,22.276114,82.070015,117.386795,Массовый,0,0,0
4,0.000000,78127603,4,71867.645241,121030.757812,252079.132812,0.000000,8.947612e+04,207176.398438,0.000000,...,126.881142,2.128708,0.000000,0.000000,0.000000,33.072178,Массовый,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85584,0.000000,6163462,85584,50582.660400,14605.051758,252079.132812,6170.456055,6.814772e+03,207176.398438,0.000000,...,0.000000,0.000000,36.119827,0.000000,73.875885,0.000000,Массовый,0,0,0
85585,841071.000000,22320709,85585,413239.799133,510666.125000,252079.132812,397536.968750,1.012610e+06,207176.398438,850407.312500,...,0.000000,1.143665,0.000000,0.000000,0.000000,2.739857,Привилегия,0,0,0
85586,27.990000,23863045,85586,76633.474609,4456.615723,252079.132812,0.000000,0.000000e+00,207176.398438,8278.396484,...,20.158716,6.629341,0.000000,0.000000,50.728161,0.000000,Массовый,0,0,0
85587,0.000000,33408678,85587,71867.645241,3737.191895,252079.132812,433.325165,8.750564e+04,207176.398438,79891.273438,...,27.778503,9.421430,41.665630,0.000000,0.000000,99.994492,Массовый,0,0,0


# Доп таблицы

In [13]:
# Drop columns with more than 70% NaN from the auxiliary tables. With
# mode='categorical' only object/category/bool columns are checked.
# NOTE(review): numeric columns are not checked here, yet a later cell drops
# fully empty numeric columns by name — confirm 'categorical' is intended.
train_card_spending_df = preprocess_data(train_card_spending_df, mode='categorical')
train_mcc_operations_df = preprocess_data(train_mcc_operations_df, mode='categorical')
train_mcc_preferences_df = preprocess_data(train_mcc_preferences_df, mode='categorical')

In [14]:
#train_card_spending_df = remove_temporal_duplicates(train_card_spending_df)
#train_mcc_operations_df = remove_temporal_duplicates(train_mcc_operations_df)
#train_mcc_preferences_df = remove_temporal_duplicates(train_mcc_preferences_df)

In [15]:
# Drop low-variance numeric columns (default threshold 0.01) from each
# auxiliary table. No columns are excluded from the check.
train_card_spending_df = filter_by_variance(train_card_spending_df)
train_mcc_operations_df = filter_by_variance(train_mcc_operations_df)
train_mcc_preferences_df = filter_by_variance(train_mcc_preferences_df)

In [16]:
#train_card_spending_df = remove_correlated_features(train_card_spending_df)
#train_mcc_operations_df = remove_correlated_features(train_mcc_operations_df)
#train_mcc_preferences_df = remove_correlated_features(train_mcc_preferences_df)

In [17]:

def _fill_median(frame, columns):
    """Fill NaN in the given columns of `frame`, in place, with each column's median."""
    frame[columns] = frame[columns].fillna(frame[columns].median())


def _fill_median_by_prefix(frame, prefixes):
    """Median-fill, one prefix after another, all columns whose name starts with it."""
    for prefix in prefixes:
        _fill_median(frame, frame.columns[frame.columns.str.startswith(prefix)])


# NaN handling for train_card_spending_df
df = train_card_spending_df.copy()
_fill_median_by_prefix(df, ['cc_', 'cnt_', 'sum_'])
_fill_median(df, ['last_time_pay_tr', 'max_tr_pay_1'])
# Original note: the 'max_tr_' fill applies when remove_correlated_features is not run.
_fill_median_by_prefix(df, ['max_tr_'])
df = df.drop('sum_tr_house_rental_1', axis=1)  # completely empty column
train_card_spending_df = df.copy()


# NaN handling for train_mcc_operations_df
df = train_mcc_operations_df.copy()
_fill_median_by_prefix(df, ['cc_', 'cnt_', 'sum_'])
df = df.drop('sum_most_popular_mcc_7321_1m', axis=1)  # completely empty column
train_mcc_operations_df = df.copy()


# NaN handling for train_mcc_preferences_df
df = train_mcc_preferences_df.copy()
_fill_median_by_prefix(df, ['preferences_'])
train_mcc_preferences_df = df.copy()


# Соединение

In [18]:
# Join the three auxiliary tables onto the main table. Every frame carries
# 'target', 'id' and 'index', so all three serve as the join key.
join_keys = ['target', 'id', 'index']
train_full_info = (
    train_full_info
    .merge(train_card_spending_df, on=join_keys)
    .merge(train_mcc_operations_df, on=join_keys)
    .merge(train_mcc_preferences_df, on=join_keys)
)

In [19]:
# Work on a copy of the merged training table and display it.
df = train_full_info.copy()
df

Unnamed: 0,target,id,index,app_income_app,avg_dep_avg_balance_12month_amt,avg_dep_avg_balance_12month_amt_term,avg_dep_avg_balance_12month_amt_term_savings,avg_dep_avg_balance_1month_amt,avg_dep_avg_balance_1month_amt_term,avg_dep_avg_balance_1month_amt_term_savings,...,preferences_div_cnt_tr_cat_cash_services_12m,preferences_div_cnt_tr_cat_restaurants_fastfood_12m,preferences_div_cnt_tr_cat_supermarkets_12m,preferences_div_cnt_tr_cat_telecommunication_12m,preferences_div_cnt_tr_cat_transportation_other_12m,preferences_div_cnt_tr_cat_unknown_12m,preferences_div_sum_tr_cat_cash_services_12m,preferences_div_sum_tr_cat_supermarkets_12m,preferences_div_sum_tr_cat_telecommunication_12m,preferences_div_sum_tr_cat_unknown_12m
0,0.000000,97678374,0,71867.645241,121030.757812,252079.132812,4027.373535,8.947612e+04,207176.398438,9910.662109,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
1,0.000000,62472650,1,71867.645241,121030.757812,252079.132812,132074.539062,8.947612e+04,207176.398438,91689.093750,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
2,219932.906250,94308112,2,105309.652023,315208.781250,252079.132812,274816.375000,2.356619e+05,207176.398438,225698.500000,...,0.051095,0.218978,0.124088,0.043796,0.000000,0.000000,0.531944,0.026082,0.026841,0.000000
3,631.770020,68994873,3,339.378130,43187.953125,252079.132812,5277.233887,2.844051e+04,207176.398438,0.000000,...,0.160920,0.022989,0.149425,0.057471,0.011494,0.126437,0.470917,0.173399,0.010788,0.076318
4,0.000000,78127603,4,71867.645241,121030.757812,252079.132812,0.000000,8.947612e+04,207176.398438,0.000000,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85584,0.000000,6163462,85584,50582.660400,14605.051758,252079.132812,6170.456055,6.814772e+03,207176.398438,0.000000,...,0.013158,0.039474,0.328947,0.039474,0.276316,0.000000,0.037769,0.258943,0.135032,0.000088
85585,841071.000000,22320709,85585,413239.799133,510666.125000,252079.132812,397536.968750,1.012610e+06,207176.398438,850407.312500,...,0.016696,0.288225,0.187170,0.002636,0.043937,0.000879,0.286421,0.096337,0.002220,0.000620
85586,27.990000,23863045,85586,76633.474609,4456.615723,252079.132812,0.000000,0.000000e+00,207176.398438,8278.396484,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000036,0.000000,0.000000
85587,0.000000,33408678,85587,71867.645241,3737.191895,252079.132812,433.325165,8.750564e+04,207176.398438,79891.273438,...,0.016949,0.000000,0.508475,0.000000,0.084746,0.000000,0.111758,0.488700,0.000000,0.000004


In [20]:
#df = filter_by_variance(df)
#df

In [21]:
#df = remove_correlated_features(df)
#df

In [22]:
#df = filter_by_low_correlation(df)
#df

In [23]:
# Drop every column (default mode 'both') with more than 70% NaN.
df = preprocess_data(df)

In [24]:
# Persist the final training table (including 'target', 'id', 'index') as CSV
# without the pandas row index.
train_full_info = df.copy()
train_full_info.to_csv('processed_data.csv', index=False)

# Данные для сабмита

In [25]:
def filter_columns_by_order(source_df, target_df, raise_if_missing=False):
    """
    Restrict target_df to the columns of source_df, in source_df's column order.

    Parameters:
    - source_df: DataFrame that defines which columns to keep and their order
    - target_df: DataFrame to filter
    - raise_if_missing: when True, raise if source_df has columns that
      target_df lacks; when False such columns are skipped silently

    Returns:
    - target_df restricted to the shared columns, ordered as in source_df

    Raises:
    - ValueError: columns of source_df are absent from target_df and
      raise_if_missing is True
    """
    available = target_df.columns

    # Single pass over the source order, splitting into present / absent.
    columns_to_keep = []
    missing_columns = []
    for name in source_df.columns.tolist():
        if name in available:
            columns_to_keep.append(name)
        else:
            missing_columns.append(name)

    if raise_if_missing and missing_columns:
        raise ValueError(f"Отсутствуют колонки в target_df: {missing_columns}")

    return target_df[columns_to_keep]

In [26]:
# Test data: the same four feature tables as for training (no target file).
test_card_spending_df = pd.read_parquet('input_data/test_card_spending_df.parquet')
test_main_df = pd.read_parquet('input_data/test_main_df.parquet')
test_mcc_operations_df = pd.read_parquet('input_data/test_mcc_operations_df.parquet')
test_mcc_preferences_df = pd.read_parquet('input_data/test_mcc_preferences_df.parquet')

In [27]:
# Copy the positional index into a regular column so the test tables can be
# merged on ('id', 'index'), mirroring the training tables.
test_main_df['index'] = test_main_df.index
test_card_spending_df['index'] = test_card_spending_df.index
test_mcc_operations_df['index'] = test_mcc_operations_df.index
test_mcc_preferences_df['index'] = test_mcc_preferences_df.index

In [28]:
# Join the auxiliary test tables onto the main test table on ('id', 'index').
test_full_info = pd.merge(test_main_df, test_card_spending_df, on=['id', 'index'])
test_full_info = pd.merge(test_full_info, test_mcc_operations_df, on=['id', 'index'])
test_full_info = pd.merge(test_full_info, test_mcc_preferences_df, on=['id', 'index'])

# These are inner joins: a row whose ('id', 'index') pair is missing from any
# table would vanish silently. Fail loudly if any test row was lost.
assert len(test_full_info) == len(test_main_df), (
    f"merge on ['id', 'index'] lost test rows: {len(test_main_df)} -> {len(test_full_info)}"
)

In [29]:
# Keep only the columns that survived in the training table, in the same order.
# raise_if_missing stays False, so training columns absent from the test frame
# are skipped silently.
# NOTE(review): this notebook applies no NaN imputation to the test frame
# (all_nans_fullfilling and the median fills run on training data only), and
# the displayed test_full_info still contains NaN — confirm this is intended.
test_full_info = filter_columns_by_order(train_full_info, test_full_info)

In [30]:
# Persist the column-aligned test table as CSV without the pandas row index.
test_full_info.to_csv('test_data.csv', index=False)

In [31]:
test_full_info.shape

(127756, 2475)

In [32]:
train_full_info.shape

(85589, 2476)

In [33]:
train_full_info

Unnamed: 0,target,id,index,app_income_app,avg_dep_avg_balance_12month_amt,avg_dep_avg_balance_12month_amt_term,avg_dep_avg_balance_12month_amt_term_savings,avg_dep_avg_balance_1month_amt,avg_dep_avg_balance_1month_amt_term,avg_dep_avg_balance_1month_amt_term_savings,...,preferences_div_cnt_tr_cat_cash_services_12m,preferences_div_cnt_tr_cat_restaurants_fastfood_12m,preferences_div_cnt_tr_cat_supermarkets_12m,preferences_div_cnt_tr_cat_telecommunication_12m,preferences_div_cnt_tr_cat_transportation_other_12m,preferences_div_cnt_tr_cat_unknown_12m,preferences_div_sum_tr_cat_cash_services_12m,preferences_div_sum_tr_cat_supermarkets_12m,preferences_div_sum_tr_cat_telecommunication_12m,preferences_div_sum_tr_cat_unknown_12m
0,0.000000,97678374,0,71867.645241,121030.757812,252079.132812,4027.373535,8.947612e+04,207176.398438,9910.662109,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
1,0.000000,62472650,1,71867.645241,121030.757812,252079.132812,132074.539062,8.947612e+04,207176.398438,91689.093750,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
2,219932.906250,94308112,2,105309.652023,315208.781250,252079.132812,274816.375000,2.356619e+05,207176.398438,225698.500000,...,0.051095,0.218978,0.124088,0.043796,0.000000,0.000000,0.531944,0.026082,0.026841,0.000000
3,631.770020,68994873,3,339.378130,43187.953125,252079.132812,5277.233887,2.844051e+04,207176.398438,0.000000,...,0.160920,0.022989,0.149425,0.057471,0.011494,0.126437,0.470917,0.173399,0.010788,0.076318
4,0.000000,78127603,4,71867.645241,121030.757812,252079.132812,0.000000,8.947612e+04,207176.398438,0.000000,...,0.042429,0.046565,0.258621,0.000000,0.004525,0.000000,0.315492,0.097882,0.000054,0.000064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85584,0.000000,6163462,85584,50582.660400,14605.051758,252079.132812,6170.456055,6.814772e+03,207176.398438,0.000000,...,0.013158,0.039474,0.328947,0.039474,0.276316,0.000000,0.037769,0.258943,0.135032,0.000088
85585,841071.000000,22320709,85585,413239.799133,510666.125000,252079.132812,397536.968750,1.012610e+06,207176.398438,850407.312500,...,0.016696,0.288225,0.187170,0.002636,0.043937,0.000879,0.286421,0.096337,0.002220,0.000620
85586,27.990000,23863045,85586,76633.474609,4456.615723,252079.132812,0.000000,0.000000e+00,207176.398438,8278.396484,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000036,0.000000,0.000000
85587,0.000000,33408678,85587,71867.645241,3737.191895,252079.132812,433.325165,8.750564e+04,207176.398438,79891.273438,...,0.016949,0.000000,0.508475,0.000000,0.084746,0.000000,0.111758,0.488700,0.000000,0.000004


In [34]:
test_full_info

Unnamed: 0,id,index,app_income_app,avg_dep_avg_balance_12month_amt,avg_dep_avg_balance_12month_amt_term,avg_dep_avg_balance_12month_amt_term_savings,avg_dep_avg_balance_1month_amt,avg_dep_avg_balance_1month_amt_term,avg_dep_avg_balance_1month_amt_term_savings,avg_dep_avg_balance_3month_amt,...,preferences_div_cnt_tr_cat_cash_services_12m,preferences_div_cnt_tr_cat_restaurants_fastfood_12m,preferences_div_cnt_tr_cat_supermarkets_12m,preferences_div_cnt_tr_cat_telecommunication_12m,preferences_div_cnt_tr_cat_transportation_other_12m,preferences_div_cnt_tr_cat_unknown_12m,preferences_div_sum_tr_cat_cash_services_12m,preferences_div_sum_tr_cat_supermarkets_12m,preferences_div_sum_tr_cat_telecommunication_12m,preferences_div_sum_tr_cat_unknown_12m
0,13423144,0,71645.421875,1.316281e+05,,8.547828e+04,1.241686e+05,,0.000000e+00,9.433326e+04,...,0.090909,0.000000,0.000000,0.454545,0.363636,0.000000,0.989110,0.000000,0.000165,0.000000
1,40935650,1,,3.000000e-05,,,4.542342e+03,,,0.000000e+00,...,,,,,,,,,,
2,78343612,2,141846.140625,5.467297e+05,,5.229132e+05,2.100188e+04,,1.548335e+04,2.065226e+04,...,0.011038,0.270052,0.232524,0.000000,0.182487,0.029433,0.242914,0.155051,0.000000,0.008563
3,87022696,3,249199.406250,1.017577e+06,1.021503e+06,1.016772e+06,1.041060e+06,1.037141e+06,1.035684e+06,1.037593e+06,...,,,,,,,,,,
4,75975270,4,152053.625000,2.053317e+05,,1.928096e+05,2.351390e+04,,3.049221e+04,5.050394e+04,...,0.043614,0.112150,0.570093,0.028037,0.012461,0.052960,0.558240,0.093957,0.002376,0.020390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127751,25505983,127751,,2.753942e+05,4.438964e+04,1.184002e+05,2.621852e+05,5.429364e+04,1.101635e+05,2.675314e+05,...,0.086879,0.044326,0.333333,0.021277,0.024823,0.005319,0.216565,0.285388,0.027035,0.004319
127752,75029121,127752,71090.882812,1.899698e+04,,,3.176488e+04,,,3.135360e+04,...,0.012048,0.000000,0.228916,0.096386,0.000000,0.012048,0.000847,0.035438,0.148516,0.004758
127753,17920594,127753,,4.296711e+04,,2.294632e+04,1.386265e+04,,0.000000e+00,4.905979e+04,...,0.016949,0.411017,0.088983,0.000000,0.080508,0.169492,0.049188,0.036846,0.000062,0.063918
127754,69398179,127754,89401.125000,9.688291e+03,,,7.376737e+03,,,3.170506e+03,...,0.000000,0.030708,0.246996,0.004005,0.012016,0.001335,0.000000,0.215141,0.002946,0.000329
