In [1]:
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format

In [2]:
import gc

gc.collect()

0

In [3]:
train = pd.read_csv('data/application_train.csv')
test_transformed = pd.read_csv('data/application_test.csv')

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

X = train.drop(['TARGET'], axis=1)
y = train['TARGET']

X_sk_id_curr = X['SK_ID_CURR']
test_sk_id_curr = test_transformed['SK_ID_CURR']
X = X.drop(columns='SK_ID_CURR')
test_transformed = test_transformed.drop(columns='SK_ID_CURR')

# Определяем числовые и категориальные столбцы
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Разделяем данные на числовые и категориальные
X_numerical = X[numerical_columns]
X_categorical = X[categorical_columns]
test_numerical = test_transformed[numerical_columns]
test_categorical = test_transformed[categorical_columns]

# Создаем объекты для заполнения пропусков и масштабирования
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Обрабатываем числовые данные с помощью созданных выше объектов
imputer.fit(X_numerical)
X_numerical = imputer.transform(X_numerical)
test_numerical = imputer.transform(test_numerical)

scaler.fit(X_numerical)
X_numerical = scaler.transform(X_numerical)
test_numerical = scaler.transform(test_numerical)

# Преобразуем числовые данные обратно в DataFrame
X_numerical = pd.DataFrame(X_numerical, columns=numerical_columns)
test_numerical = pd.DataFrame(test_numerical, columns=numerical_columns)

# Применяем get_dummies к категориальным данным
X_categorical = pd.get_dummies(X_categorical, drop_first=True)
test_categorical = pd.get_dummies(test_categorical, drop_first=True)

# Приводим тестовые данные к тому же набору столбцов, что и тренировочные
X_categorical, test_categorical = X_categorical.align(test_categorical, join='inner', axis=1)

# Объединяем обработанные данные
X = pd.concat([X_numerical, X_categorical.reset_index(drop=True)], axis=1)
test_transformed = pd.concat([test_numerical, test_categorical.reset_index(drop=True)], axis=1)

# Обратаывать id не нужно, поэтому сначала мы его убрали, теперь возвращаем после обработки
X.insert(0, 'SK_ID_CURR', X_sk_id_curr)
test_transformed.insert(0, 'SK_ID_CURR', test_sk_id_curr)

In [5]:
X

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
0,100002,0.000000,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,0.045086,0.852140,...,False,False,False,False,False,False,False,True,False,False
1,100003,0.000000,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,0.043648,0.951929,...,False,False,False,False,False,False,False,False,False,False
2,100004,0.000000,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,0.046161,0.827335,...,False,False,False,False,False,False,False,False,False,False
3,100006,0.000000,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,0.038817,0.601451,...,False,False,False,False,False,False,False,False,False,False
4,100007,0.000000,0.000819,0.116854,0.078975,0.117845,0.392880,0.298591,0.038820,0.825268,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.000000,0.001127,0.052360,0.101176,0.046016,0.446855,0.896392,0.046133,0.657263,...,False,False,False,False,False,False,False,True,False,False
307507,456252,0.000000,0.000396,0.056067,0.040505,0.046016,0.344429,0.251071,1.000000,0.822147,...,False,False,False,False,False,False,False,True,False,False
307508,456253,0.000000,0.001089,0.157969,0.110618,0.135802,0.065247,0.578523,0.026076,0.726937,...,False,False,False,False,False,False,True,False,False,False
307509,456254,0.000000,0.001243,0.081175,0.072499,0.069585,0.069553,0.747914,0.034258,0.896158,...,False,False,False,False,False,False,False,True,False,False


In [6]:
POS_CASH = pd.read_csv('data/POS_CASH_balance.csv')

# Выкинули ненужный столбец
POS_CASH = POS_CASH.drop(columns='SK_ID_PREV')

# one-hot
POS_CASH = pd.get_dummies(POS_CASH)

# Группировка по  SK_ID_CURR
POS_CASH = POS_CASH.groupby('SK_ID_CURR').mean().reset_index()

# Удаление столбцов с большим количеством nan 
#nan_counts = POS_CASH.isna().sum()/len(POS_CASH)
#columns_to_keep = nan_counts[nan_counts <= 0.3].index
#POS_CASH = POS_CASH[columns_to_keep]

# Заменить null на среднее
POS_CASH = POS_CASH.apply(lambda col: col.fillna(col.mean()), axis=0)

# Удаляем столбец с id, чтобы он не подвергся обработке
sk_id_curr = POS_CASH['SK_ID_CURR']
POS_CASH = POS_CASH.drop(columns=['SK_ID_CURR'])
col =  POS_CASH.columns

# Масштабирование
scaler.fit(POS_CASH)
POS_CASH = scaler.transform(POS_CASH)

# Возвращаем id
POS_CASH = pd.DataFrame(POS_CASH, columns=col)
POS_CASH.insert(0, 'SK_ID_CURR', sk_id_curr)
POS_CASH

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA
0,100001,0.246784,0.042254,0.024074,0.000297,0.000447,0.777778,0.000000,0.000000,0.000000,0.222222,0.000000,0.000000,0.000000,0.000000
1,100002,0.905263,0.323944,0.250000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,100003,0.549624,0.128270,0.096429,0.000000,0.000000,0.928571,0.000000,0.000000,0.000000,0.071429,0.000000,0.000000,0.000000,0.000000
3,100004,0.742105,0.038732,0.037500,0.000000,0.000000,0.750000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.000000,0.000000
4,100005,0.800000,0.150704,0.120000,0.000000,0.000000,0.818182,0.000000,0.000000,0.000000,0.090909,0.000000,0.000000,0.090909,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337247,456251,0.957895,0.096831,0.072917,0.000000,0.000000,0.777778,0.000000,0.000000,0.000000,0.111111,0.000000,0.000000,0.111111,0.000000
337248,456252,0.178947,0.070423,0.050000,0.000000,0.000000,0.857143,0.000000,0.000000,0.000000,0.142857,0.000000,0.000000,0.000000,0.000000
337249,456253,0.176471,0.080365,0.033333,0.000112,0.000169,0.882353,0.000000,0.000000,0.000000,0.117647,0.000000,0.000000,0.000000,0.000000
337250,456254,0.952105,0.195775,0.172500,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [7]:
bureau = pd.read_csv('data/bureau.csv')
bureau = pd.get_dummies(bureau)
bureau = bureau.drop(columns='SK_ID_BUREAU')
bureau = bureau.groupby(['SK_ID_CURR']).mean().reset_index()
#nan_counts = bureau.isna().sum()/len(bureau)
#columns_to_keep = nan_counts[nan_counts <= 0.3].index
#bureau = bureau[columns_to_keep]
bureau = bureau.apply(lambda col: col.fillna(col.mean()), axis=0)
sk_id_curr = bureau['SK_ID_CURR']
bureau = bureau.drop(columns=['SK_ID_CURR'])
col =  bureau.columns
scaler.fit(bureau)
bureau = scaler.transform(bureau)
bureau = pd.DataFrame(bureau, columns=col)
bureau.insert(0, 'SK_ID_CURR', sk_id_curr)
bureau

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,100001,0.748460,0.000000,0.574185,0.901450,0.000043,0.000000,0.001048,0.022123,0.021291,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,100002,0.700890,0.000000,0.568281,0.916731,0.000014,0.000000,0.000546,0.021440,0.023030,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,100003,0.520619,0.000000,0.565606,0.868999,0.000000,0.000000,0.001284,0.020510,0.065332,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,100004,0.703285,0.000000,0.566372,0.936429,0.000000,0.000000,0.000477,0.020510,0.021291,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,100005,0.934748,0.000000,0.579069,0.985316,0.000000,0.000000,0.001106,0.024096,0.021291,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305806,456249,0.429474,0.000000,0.556193,0.837074,0.000053,0.000000,0.001435,0.020819,0.021291,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305807,456250,0.704997,0.000000,0.590688,0.909270,0.000000,0.000000,0.005194,0.034592,0.025515,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305808,456253,0.703114,0.000000,0.576896,0.905211,0.000043,0.000000,0.004998,0.029008,0.021291,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305809,456254,0.622177,0.000000,0.561302,0.897451,0.000043,0.000000,0.000227,0.020510,0.022573,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
previous = pd.read_csv('data/previous_application.csv')
previous = previous.drop(columns='SK_ID_PREV')
previous = pd.get_dummies(previous)
previous = previous.groupby('SK_ID_CURR').mean().reset_index()
#nan_counts = previous.isna().sum()/len(previous)
#columns_to_keep = nan_counts[nan_counts <= 0.3].index
#previous = previous[columns_to_keep]
previous = previous.apply(lambda col: col.fillna(col.mean()), axis=0)
sk_id_curr = previous['SK_ID_CURR']
previous = previous.drop(columns=['SK_ID_CURR'])
col =  previous.columns
scaler.fit(previous)
previous = scaler.transform(previous)
previous = pd.DataFrame(previous, columns=col)
previous.insert(0, 'SK_ID_CURR', sk_id_curr)
previous

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,...,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest
0,100001,0.013151,0.006132,0.005873,0.001245,0.006132,0.565217,1.000000,0.105414,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
1,100002,0.030796,0.044211,0.044211,0.000000,0.044211,0.391304,1.000000,0.000008,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000
2,100003,0.188246,0.107515,0.119553,0.001700,0.107515,0.637681,1.000000,0.050556,0.159496,...,0.333333,0.000000,0.333333,0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000
3,100004,0.017832,0.005996,0.004964,0.002400,0.005996,0.217391,1.000000,0.214211,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
4,100005,0.016021,0.005508,0.004957,0.002205,0.011017,0.456522,1.000000,0.110100,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338852,456251,0.021989,0.009989,0.009989,0.000000,0.009989,0.739130,1.000000,0.000008,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
338853,456252,0.033534,0.014221,0.014030,0.001707,0.014221,0.434783,1.000000,0.063097,0.159496,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
338854,456253,0.015879,0.005966,0.005093,0.002175,0.005966,0.500000,1.000000,0.216544,0.159496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
338855,456254,0.035553,0.029955,0.033195,0.000000,0.029955,0.652174,1.000000,0.000008,0.159496,...,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000


In [9]:
installments_payments = pd.read_csv('data/installments_payments.csv')
installments_payments = pd.get_dummies(installments_payments)
installments_payments = installments_payments.drop(columns='SK_ID_PREV')
installments_payments = installments_payments.groupby('SK_ID_CURR').mean().reset_index()
#nan_counts = installments_payments.isna().sum()/len(installments_payments)
#columns_to_keep = nan_counts[nan_counts <= 0.3].index
#installments_payments = installments_payments[columns_to_keep]
installments_payments = installments_payments.apply(lambda col: col.fillna(col.mean()), axis=0)
sk_id_curr = installments_payments['SK_ID_CURR']
installments_payments = installments_payments.drop(columns=['SK_ID_CURR'])
col = installments_payments.columns
scaler.fit(installments_payments)
installments_payments = scaler.transform(installments_payments)
installments_payments = pd.DataFrame(installments_payments, columns=col)
installments_payments.insert(0, 'SK_ID_CURR', sk_id_curr)
installments_payments

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,100001,0.029304,0.011971,0.251554,0.285528,0.002350,0.002350
1,100002,0.026991,0.062846,0.899966,0.898168,0.004615,0.004615
2,100003,0.026667,0.028490,0.528893,0.549439,0.025854,0.025854
3,100004,0.034188,0.006983,0.742720,0.752716,0.002833,0.002833
4,100005,0.028490,0.027931,0.800274,0.802296,0.002492,0.002491
...,...,...,...,...,...,...,...
339582,456251,0.029304,0.020949,0.959918,0.950037,0.002992,0.002992
339583,456252,0.025641,0.017457,0.181912,0.220719,0.004021,0.004020
339584,456253,0.025641,0.026435,0.188103,0.222807,0.001757,0.001643
339585,456254,0.025641,0.029769,0.952291,0.948415,0.004088,0.004088


In [10]:
credit_card_balance = pd.read_csv('data/credit_card_balance.csv')
credit_card_balance = credit_card_balance.drop(columns='SK_ID_PREV')
credit_card_balance = pd.get_dummies(credit_card_balance)
credit_card_balance = credit_card_balance.groupby('SK_ID_CURR').mean().reset_index()
#nan_counts = credit_card_balance.isna().sum()/len(credit_card_balance)
#columns_to_keep = nan_counts[nan_counts <= 0.3].index
#credit_card_balance = credit_card_balance[columns_to_keep]
credit_card_balance = credit_card_balance.apply(lambda col: col.fillna(col.mean()), axis=0)
sk_id_curr = POS_CASH['SK_ID_CURR']
credit_card_balance = credit_card_balance.drop(columns=['SK_ID_CURR'])
col = credit_card_balance.columns
scaler.fit(credit_card_balance)
credit_card_balance = scaler.transform(credit_card_balance)
credit_card_balance = pd.DataFrame(credit_card_balance, columns=col)
credit_card_balance.insert(0, 'SK_ID_CURR', sk_id_curr)
credit_card_balance

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
0,100001,0.949166,0.003145,0.200000,0.013287,0.000011,0.002595,0.005191,0.000000,0.011258,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,100002,0.237491,0.061627,0.121622,0.002689,0.001516,0.000000,0.000000,0.091975,0.003040,...,0.352672,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,100003,0.034156,0.022638,0.097569,0.007020,0.003694,0.000000,0.000000,0.033815,0.004500,...,0.256207,0.000006,0.000006,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,100004,0.816998,0.003145,0.500000,0.013287,0.000011,0.002595,0.005191,0.000000,0.011258,...,0.000000,0.000000,0.000000,0.411765,0.000000,0.588235,0.000000,0.000000,0.000000,0.000000
4,100005,0.867832,0.003145,0.100000,0.013287,0.000011,0.002595,0.005191,0.000000,0.011258,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103553,209395,0.593329,0.144657,0.219512,0.027060,0.016619,0.000000,0.001462,0.151443,0.020539,...,0.186142,0.000000,0.000000,0.878049,0.000000,0.121951,0.000000,0.000000,0.000000,0.000000
103554,209396,0.908499,0.017246,0.100000,0.000000,0.009415,0.000000,0.009404,0.033458,0.011787,...,0.047904,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
103555,209398,0.023989,0.028066,0.106667,0.002362,0.001341,0.000000,0.000008,0.032889,0.003066,...,0.362631,0.000019,0.000013,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
103556,209399,0.755997,0.003145,0.666667,0.013287,0.000011,0.002595,0.005191,0.000000,0.011258,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [11]:
# Мерджим всё полученное в одну обучающую выборку по sk_id_curr

X = X.merge(right=POS_CASH.reset_index(drop=True), how='left', on='SK_ID_CURR')
test_transformed = test_transformed.merge(right=POS_CASH.reset_index(drop=True), how='left', on='SK_ID_CURR')

X = X.merge(right=bureau.reset_index(drop=True), how='left', on='SK_ID_CURR')
test_transformed = test_transformed.merge(right=bureau.reset_index(drop=True), how='left', on='SK_ID_CURR')

X = X.merge(right=previous.reset_index(drop=True), how='left', on='SK_ID_CURR')
test_transformed = test_transformed.merge(right=previous.reset_index(drop=True), how='left', on='SK_ID_CURR')

X = X.merge(right=installments_payments.reset_index(drop=True), how='left', on='SK_ID_CURR')
test_transformed = test_transformed.merge(right=installments_payments.reset_index(drop=True), how='left', on='SK_ID_CURR')

X = X.merge(right=credit_card_balance.reset_index(drop=True), how='left', on='SK_ID_CURR')
test_transformed = test_transformed.merge(right=credit_card_balance.reset_index(drop=True), how='left', on='SK_ID_CURR')

In [12]:
X

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,CNT_INSTALMENT_MATURE_CUM,SK_DPD_y,SK_DPD_DEF_y,NAME_CONTRACT_STATUS_Active_y,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed_y,NAME_CONTRACT_STATUS_Demand_y,NAME_CONTRACT_STATUS_Refused_y,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed_y
0,100002,0.000000,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,0.045086,0.852140,...,0.352672,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,100003,0.000000,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,0.043648,0.951929,...,0.256207,0.000006,0.000006,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,100004,0.000000,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,0.046161,0.827335,...,0.000000,0.000000,0.000000,0.411765,0.000000,0.588235,0.000000,0.000000,0.000000,0.000000
3,100006,0.000000,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,0.038817,0.601451,...,0.267547,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,100007,0.000000,0.000819,0.116854,0.078975,0.117845,0.392880,0.298591,0.038820,0.825268,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.000000,0.001127,0.052360,0.101176,0.046016,0.446855,0.896392,0.046133,0.657263,...,,,,,,,,,,
307507,456252,0.000000,0.000396,0.056067,0.040505,0.046016,0.344429,0.251071,1.000000,0.822147,...,,,,,,,,,,
307508,456253,0.000000,0.001089,0.157969,0.110618,0.135802,0.065247,0.578523,0.026076,0.726937,...,,,,,,,,,,
307509,456254,0.000000,0.001243,0.081175,0.072499,0.069585,0.069553,0.747914,0.034258,0.896158,...,,,,,,,,,,


In [14]:
# В каких-то таблицах, которые мерджили, могло не быть каких-то id, но т.к мерджим через left, то не найденный id будет null
nan_counts = X.isna().sum() / len(X)
columns_to_keep = nan_counts[nan_counts <= 0.2].index

X = X[columns_to_keep]
test_transformed = test_transformed[columns_to_keep]

X = X.apply(lambda col: col.fillna(col.mean()), axis=0)
test_transformed = test_transformed.apply(lambda col: col.fillna(col.mean()), axis=0)

In [15]:
def clean_column_names(df):
    df.columns = (
        df.columns
        .str.replace('[^A-Za-z0-9_]', '_', regex=True)
        .str.strip('_')
    )
    return df
X = clean_column_names(X)
test_transformed = clean_column_names(test_transformed)

In [25]:
X

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,PRODUCT_COMBINATION_POS_mobile_with_interest,PRODUCT_COMBINATION_POS_mobile_without_interest,PRODUCT_COMBINATION_POS_other_with_interest,PRODUCT_COMBINATION_POS_others_without_interest,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,100002,0.000000,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,0.045086,0.852140,...,0.000000,0.000000,1.000000,0.000000,0.026991,0.062846,0.899966,0.898168,0.004615,0.004615
1,100003,0.000000,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,0.043648,0.951929,...,0.000000,0.000000,0.000000,0.000000,0.026667,0.028490,0.528893,0.549439,0.025854,0.025854
2,100004,0.000000,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,0.046161,0.827335,...,0.000000,1.000000,0.000000,0.000000,0.034188,0.006983,0.742720,0.752716,0.002833,0.002833
3,100006,0.000000,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,0.038817,0.601451,...,0.000000,0.000000,0.000000,0.000000,0.028846,0.024004,0.914611,0.912443,0.025133,0.025133
4,100007,0.000000,0.000819,0.116854,0.078975,0.117845,0.392880,0.298591,0.038820,0.825268,...,0.166667,0.000000,0.000000,0.000000,0.029915,0.042214,0.648645,0.664523,0.005057,0.004877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.000000,0.001127,0.052360,0.101176,0.046016,0.446855,0.896392,0.046133,0.657263,...,1.000000,0.000000,0.000000,0.000000,0.029304,0.020949,0.959918,0.950037,0.002992,0.002992
307507,456252,0.000000,0.000396,0.056067,0.040505,0.046016,0.344429,0.251071,1.000000,0.822147,...,0.000000,0.000000,0.000000,0.000000,0.025641,0.017457,0.181912,0.220719,0.004021,0.004020
307508,456253,0.000000,0.001089,0.157969,0.110618,0.135802,0.065247,0.578523,0.026076,0.726937,...,1.000000,0.000000,0.000000,0.000000,0.025641,0.026435,0.188103,0.222807,0.001757,0.001643
307509,456254,0.000000,0.001243,0.081175,0.072499,0.069585,0.069553,0.747914,0.034258,0.896158,...,0.500000,0.000000,0.000000,0.000000,0.025641,0.029769,0.952291,0.948415,0.004088,0.004088


In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Словарь для хранения лучших моделей
best_models = {}
x_without_id = X.drop(columns='SK_ID_CURR')
# Определяем StratifiedKFold для кросс-валидации с 7 фолдами
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# Функция для подбора гиперпараметров RandomForest с кросс-валидацией
def optimize_random_forest(trial):
    # Сетка параметров
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"])
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, x_without_id, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Оптимизация гиперпараметров для RandomForest
print("Оптимизация гиперпараметров для RandomForest...")
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(optimize_random_forest, n_trials=30)  # 30 раз выбрать параметры и прогнать кросс-валидацию на 7 фолдов (1 trial - 7 моделей)
best_models["RandomForest"] = {
    "best_params": study_rf.best_params,
    "best_score": study_rf.best_value,
}

# Вывод лучших параметров и результатов для RandomForest
print(f"\nМодель: RandomForest")
print(f"Лучшие параметры: {best_models['RandomForest']['best_params']}")
print(f"Лучший ROC-AUC: {best_models['RandomForest']['best_score']:.4f}")

[I 2025-01-20 18:52:30,694] A new study created in memory with name: no-name-a7560af6-63a4-44b8-8fa1-b138eaaab0c8


Оптимизация гиперпараметров для RandomForest...


[I 2025-01-20 19:00:55,898] Trial 0 finished with value: 0.746710227368851 and parameters: {'n_estimators': 191, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 0 with value: 0.746710227368851.
[I 2025-01-20 19:07:39,803] Trial 1 finished with value: 0.749297199194043 and parameters: {'n_estimators': 198, 'max_depth': 14, 'min_samples_split': 14, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 1 with value: 0.749297199194043.
[I 2025-01-20 19:17:59,406] Trial 2 finished with value: 0.7510211916884353 and parameters: {'n_estimators': 266, 'max_depth': 17, 'min_samples_split': 18, 'min_samples_leaf': 8, 'criterion': 'gini'}. Best is trial 2 with value: 0.7510211916884353.
[I 2025-01-20 19:28:16,577] Trial 3 finished with value: 0.7573189676833126 and parameters: {'n_estimators': 246, 'max_depth': 20, 'min_samples_split': 17, 'min_samples_leaf': 15, 'criterion': 'entropy'}. Best is trial 3 with value: 0.7573189676833126.
[I 20

In [None]:
# Для удобства и возможности отойти от ПК запихано в одну ячейку последовательное обучение 3 видов бустинга, по аналогии с решающим лесом

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Словарь для хранения лучших моделей
best_models = {}
x_without_id = X.drop(columns='SK_ID_CURR')
# Определяем StratifiedKFold для кросс-валидации
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Функция для подбора гиперпараметров XGBoost с кросс-валидацией
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 150, 250),
        "max_depth": trial.suggest_int("max_depth", 4, 6),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "subsample": trial.suggest_float("subsample", 0.6, 0.8),
        "min_child_weight": trial.suggest_int("min_child_weight", 3, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.1, 5),
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, x_without_id, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Функция для подбора гиперпараметров LightGBM с кросс-валидацией
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 300),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 10),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 10),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 1),
        "max_bin": trial.suggest_int("max_bin", 128, 512),
    }
    model = LGBMClassifier(**params, random_state=42, verbose=-1)
    scores = cross_val_score(model, x_without_id, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Функция для подбора гиперпараметров CatBoost с кросс-валидацией
def optimize_catboost(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 200, 300),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
    }
    model = CatBoostClassifier(**params, random_state=42, verbose=0)
    scores = cross_val_score(model, x_without_id, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Оптимизация гиперпараметров для каждой модели с расширенными гиперпараметрами
for model_name, optimize_function in zip(
    ["XGBoost", "LightGMB", "CatBoost"],
    [optimize_xgboost, optimize_lightgbm, optimize_catboost]
):
    print(f"Оптимизация гиперпараметров для {model_name}...")
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_function, n_trials=30)  # Увеличьте количество испытаний при необходимости
    best_models[model_name] = {
        "best_params": study.best_params,
        "best_score": study.best_value,
    }

# Вывод лучших параметров и результатов
for model_name, results in best_models.items():
    print(f"\nМодель: {model_name}")
    print(f"Лучшие параметры: {results['best_params']}")
    print(f"Лучший ROC-AUC: {results['best_score']:.4f}")

[I 2025-01-20 20:50:20,798] A new study created in memory with name: no-name-28e49ef4-d930-4dc2-b97c-abeb52224b98


Оптимизация гиперпараметров для XGBoost...


[I 2025-01-20 20:53:49,527] Trial 0 finished with value: 0.7694589027315943 and parameters: {'n_estimators': 223, 'max_depth': 6, 'learning_rate': 0.17820289063859623, 'colsample_bytree': 0.8563695876107902, 'subsample': 0.7576748463472734, 'min_child_weight': 8, 'scale_pos_weight': 4.637883673163216}. Best is trial 0 with value: 0.7694589027315943.
[I 2025-01-20 20:57:00,460] Trial 1 finished with value: 0.7775271800541773 and parameters: {'n_estimators': 214, 'max_depth': 4, 'learning_rate': 0.12387999097347009, 'colsample_bytree': 0.827059084554012, 'subsample': 0.7094823436960367, 'min_child_weight': 9, 'scale_pos_weight': 1.1116958862445476}. Best is trial 1 with value: 0.7775271800541773.
[I 2025-01-20 21:00:29,615] Trial 2 finished with value: 0.7762641892438622 and parameters: {'n_estimators': 181, 'max_depth': 6, 'learning_rate': 0.1045963931621669, 'colsample_bytree': 0.8273464762205495, 'subsample': 0.6448589019505132, 'min_child_weight': 6, 'scale_pos_weight': 0.86586403070

light_gbm Trial 3 finished with value: 0.7801349809668889 and parameters: {'n_estimators': 288, 'max_depth': 5, 'learning_rate': 0.09977881793430177, 'num_leaves': 28, 'feature_fraction': 0.7446390547663707, 'bagging_fraction': 0.8423576056913252, 'min_data_in_leaf': 94, 'lambda_l1': 7.562998571707812, 'lambda_l2': 1.5003292498077758, 'min_gain_to_split': 0.857792145092775, 'max_bin': 424}. Best is trial 3 with value: 0.7801349809668889.

In [16]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Определяем StratifiedKFold для кросс-валидации
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Функция для подбора гиперпараметров LogisticRegression с кросс-валидацией
def optimize_logistic_regression(trial):
    # Подбираем параметры
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear", "saga"])
    
    # Проверяем совместимость penalty и solver
    if penalty == "elasticnet" and solver != "saga":
        raise optuna.exceptions.TrialPruned()
    if penalty == "l1" and solver not in ["liblinear", "saga"]:
        raise optuna.exceptions.TrialPruned()
    
    params = {
        "penalty": penalty,
        "C": trial.suggest_float("C", 0.01, 10, log=True),  # Инверсная сила регуляризации
        "solver": solver,
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0) if penalty == "elasticnet" else None,
        "max_iter": trial.suggest_int("max_iter", 100, 500),
    }
    
    # Убираем l1_ratio, если не используется elasticnet
    if penalty != "elasticnet":
        params.pop("l1_ratio")
    
    # Создаем модель
    model = LogisticRegression(**params, random_state=42)
    
    # Оцениваем модель с кросс-валидацией
    scores = cross_val_score(model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Оптимизация гиперпараметров с Optuna
print("Оптимизация гиперпараметров LogisticRegression...")
study = optuna.create_study(direction="maximize")
study.optimize(optimize_logistic_regression, n_trials=30)

# Вывод лучших параметров и результата
print("\nЛучшие параметры для LogisticRegression:")
print(study.best_params)
print(f"Лучший ROC-AUC: {study.best_value:.4f}")

[I 2025-01-20 17:33:17,422] A new study created in memory with name: no-name-e1e551ba-487c-48c5-a48e-c69692d2454c


Оптимизация гиперпараметров LogisticRegression...


[I 2025-01-20 17:38:26,268] Trial 0 finished with value: 0.7629041376155408 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 6.209530039157203, 'max_iter': 269}. Best is trial 0 with value: 0.7629041376155408.
[I 2025-01-20 17:42:32,721] Trial 1 finished with value: 0.7618585749665017 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 0.3403157192927136, 'max_iter': 312}. Best is trial 0 with value: 0.7629041376155408.
[I 2025-01-20 17:42:32,808] Trial 2 pruned. 
[I 2025-01-20 17:42:32,808] Trial 3 pruned. 
[I 2025-01-20 17:45:57,881] Trial 4 finished with value: 0.7627560263172061 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 4.3657956360908425, 'max_iter': 109}. Best is trial 0 with value: 0.7629041376155408.
[I 2025-01-20 17:53:50,753] Trial 5 finished with value: 0.5022452732492568 and parameters: {'penalty': 'l2', 'solver': 'saga', 'C': 0.011454524162939444, 'max_iter': 240}. Best is trial 0 with value: 0.7629041376155408.
[W 2025-01-20 17:5

## После базового обучения возьмем лучший LightGBM

In [245]:
import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

params = {
    "n_estimators": 288,
    "max_depth": 5,
    "learning_rate": 0.09977881793430177,
    "num_leaves": 28,
    "feature_fraction": 0.7446390547663707,
    "bagging_fraction": 0.8423576056913252,
    "min_data_in_leaf": 94,
    "lambda_l1": 7.562998571707812,
    "lambda_l2": 1.5003292498077758,
    "min_gain_to_split": 0.857792145092775,
    "max_bin": 424,
    "verbose": -1,
}


LightGMB = LGBMClassifier(**params, random_state=42)
LightGMB.fit(x_without_id, y)
preds = LightGMB.predict_proba(test_withoud_id)[:, 1]
submit = test_transformed[['SK_ID_CURR']]
submit['TARGET'] = preds

submit.to_csv('try_lgmc_460.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit['TARGET'] = preds


## Посмотрим, что больше всего влияло на наш лучший бустинг

In [247]:
feature_importances = LightGMB.booster_.feature_importance(importance_type='gain')  # Важность признаков
feature_names = LightGMB.feature_name_  # Имена признаков

# Создаем DataFrame для удобства анализа
importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
importances_df['row_number'] = range(len(importances_df))
importances_df[importances_df['row_number'] < 49].tail(50)

Unnamed: 0,Feature,Importance,row_number
29,EXT_SOURCE_3,34818.41082,0
28,EXT_SOURCE_2,33036.20037,1
27,EXT_SOURCE_1,9871.491664,2
6,DAYS_BIRTH,5373.499309,3
4,AMT_GOODS_PRICE_x,3699.530779,4
3,AMT_ANNUITY_x,3327.161721,5
2,AMT_CREDIT_x,3319.334081,6
227,CNT_INSTALMENT_FUTURE,2997.80416,7
105,CODE_GENDER_M,2967.070099,8
333,NAME_CONTRACT_STATUS_Refused_x,2714.190142,9


## Для тех фичей, что влияли, сгенерим полиномиальные и посмотрим на корреляцию с таргетом

In [248]:
from sklearn.preprocessing import PolynomialFeatures
# Выбор столбцов
selected_features = importances_df[importances_df['Importance'] > 1500]['Feature']
X_selected = x_without_id[selected_features]

# Генерация полиномиальных признаков
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(X_selected)

# Преобразование в DataFrame
poly_feature_names = poly.get_feature_names_out(selected_features)
X_poly_df = pd.DataFrame(X_poly, columns=poly_feature_names)

# Вычисление корреляции (Пирсон)
correlations_light = X_poly_df.corrwith(y).sort_values(ascending=True)

In [249]:
correlations_light.head(35)

EXT_SOURCE_3 EXT_SOURCE_2                       -0.193927
EXT_SOURCE_3 EXT_SOURCE_2 DAYS_FIRST_DRAWING    -0.191875
EXT_SOURCE_3 EXT_SOURCE_2 EXT_SOURCE_1          -0.188948
EXT_SOURCE_3 EXT_SOURCE_2^2                     -0.176428
EXT_SOURCE_3^2 EXT_SOURCE_2                     -0.172243
EXT_SOURCE_2 EXT_SOURCE_1 DAYS_FIRST_DRAWING    -0.167423
EXT_SOURCE_2 EXT_SOURCE_1                       -0.166046
EXT_SOURCE_3 EXT_SOURCE_1 DAYS_FIRST_DRAWING    -0.163493
EXT_SOURCE_3 EXT_SOURCE_1                       -0.163492
EXT_SOURCE_2 DAYS_FIRST_DRAWING                 -0.161094
EXT_SOURCE_2                                    -0.160295
EXT_SOURCE_3 EXT_SOURCE_2 DAYS_CREDIT           -0.159849
EXT_SOURCE_2^2 EXT_SOURCE_1                     -0.156603
EXT_SOURCE_3                                    -0.155892
EXT_SOURCE_3 DAYS_FIRST_DRAWING                 -0.154768
EXT_SOURCE_2 DAYS_FIRST_DRAWING^2               -0.154484
EXT_SOURCE_2^2 DAYS_FIRST_DRAWING               -0.151789
EXT_SOURCE_3^2

# Отсеем из трейна фичи, которые вообще не важны для бустинга, и добавим туда полниомиальные фичи, которые кореллируют с таргетом

In [250]:
# Фильтрация по важности признаков
selected_features = importances_df[importances_df['Importance'] > 50]['Feature']
X_filtered = x_without_id[selected_features]
#X_filtered = x_without_id

# Добавление признаков с корреляцией по модулю > 0.1
# Убираем дубликаты (добавляем только новые столбцы)
high_corr_features = correlations_light[correlations_light.abs() > 0.1].index
new_features = [feature for feature in high_corr_features if feature not in X_filtered.columns]

# Добавляем только новые признаки
X_filtered = pd.concat([X_filtered, X_poly_df[new_features]], axis=1)

In [251]:
X_filtered

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH,AMT_GOODS_PRICE_x,AMT_ANNUITY_x,AMT_CREDIT_x,CNT_INSTALMENT_FUTURE,CODE_GENDER_M,NAME_CONTRACT_STATUS_Refused_x,...,EXT_SOURCE_1 DAYS_FIRST_DRAWING,EXT_SOURCE_2 DAYS_FIRST_DRAWING AMT_CREDIT_SUM_DEBT,EXT_SOURCE_1 DAYS_FIRST_DRAWING^2,EXT_SOURCE_3 AMT_CREDIT_SUM_DEBT,EXT_SOURCE_2 AMT_CREDIT_SUM_DEBT,EXT_SOURCE_2 EXT_SOURCE_1 AMT_ANNUITY_x,EXT_SOURCE_3 DAYS_FIRST_DRAWING DAYS_CREDIT,EXT_SOURCE_3 EXT_SOURCE_1 AMT_ANNUITY_x,DAYS_BIRTH DAYS_CREDIT,DAYS_BIRTH DAYS_CREDIT^2
0,0.155054,0.307542,0.072215,0.888839,0.077441,0.090032,0.090287,0.250000,True,0.000000,...,0.072215,0.006594,0.072215,0.003324,0.006594,0.002000,0.108676,0.001008,0.622978,0.436639
1,0.597163,0.727773,0.312933,0.477114,0.271605,0.132924,0.311736,0.096429,False,0.000000,...,0.312933,0.014927,0.312933,0.012248,0.014927,0.030273,0.310895,0.024840,0.248395,0.129319
2,0.814130,0.650190,0.518318,0.348534,0.023569,0.020025,0.022472,0.037500,True,0.000000,...,0.518318,0.013335,0.518318,0.016698,0.013335,0.006748,0.572566,0.008450,0.245119,0.172389
3,0.597163,0.760751,0.518318,0.350846,0.063973,0.109477,0.066837,0.144167,False,0.111111,...,0.518318,0.017913,0.518318,0.014061,0.017913,0.043168,0.375823,0.033885,0.220804,0.138962
4,0.597163,0.377472,0.518318,0.298591,0.117845,0.078975,0.116854,0.149495,True,0.000000,...,0.518318,0.007742,0.518318,0.012248,0.007742,0.015451,0.362344,0.024444,0.181178,0.109934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0.597163,0.797231,0.138170,0.896392,0.046016,0.101176,0.052360,0.072917,True,0.000000,...,0.138170,0.018772,0.138170,0.014061,0.018772,0.011145,0.375823,0.008348,0.564142,0.355041
307507,0.597163,0.135663,0.518318,0.251071,0.046016,0.040505,0.056067,0.050000,False,0.000000,...,0.518318,0.003194,0.518318,0.014061,0.003194,0.002848,0.375823,0.012537,0.158011,0.099444
307508,0.243815,0.626575,0.769370,0.578523,0.135802,0.110618,0.157969,0.033333,False,0.000000,...,0.769370,0.018175,0.769370,0.007072,0.018175,0.053325,0.171430,0.020750,0.406768,0.286004
307509,0.737587,0.601360,0.518318,0.747914,0.069585,0.072499,0.081175,0.172500,False,0.000000,...,0.518318,0.012334,0.518318,0.015128,0.012334,0.022598,0.458909,0.027717,0.465335,0.289520


## Теперь в тесте провернем то же самое. Уберем фичи, которые убрали в трейне, и расчитаем полиномиальные для теста

In [252]:
# Удаление столбцов, которые не попали в X_filtered
columns_to_keep = X_filtered.columns.intersection(test_transformed.columns)
test_transformed_filtered = test_withoud_id[columns_to_keep]

# Пересчет полиномиальных признаков для тестового набора
# Используем тот же объект `poly`, который был обучен на тренировочном наборе
test_selected_features = test_withoud_id[importances_df[importances_df['Importance'] > 1500]['Feature']]
test_poly = poly.transform(test_selected_features)

# Преобразование в DataFrame с именами столбцов
test_poly_df = pd.DataFrame(test_poly, columns=poly_feature_names, index=test_transformed.index)

# Отбор тех же полиномиальных признаков, которые добавлялись в трейн
new_features_test_df = test_poly_df[new_features]

# Добавление полиномиальных признаков к тестовому набору
test_transformed_filtered = pd.concat([test_transformed_filtered, new_features_test_df], axis=1)

In [253]:
test_transformed_filtered

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH,AMT_GOODS_PRICE_x,AMT_ANNUITY_x,AMT_CREDIT_x,CNT_INSTALMENT_FUTURE,CODE_GENDER_M,NAME_CONTRACT_STATUS_Refused_x,...,EXT_SOURCE_1 DAYS_FIRST_DRAWING,EXT_SOURCE_2 DAYS_FIRST_DRAWING AMT_CREDIT_SUM_DEBT,EXT_SOURCE_1 DAYS_FIRST_DRAWING^2,EXT_SOURCE_3 AMT_CREDIT_SUM_DEBT,EXT_SOURCE_2 AMT_CREDIT_SUM_DEBT,EXT_SOURCE_2 EXT_SOURCE_1 AMT_ANNUITY_x,EXT_SOURCE_3 DAYS_FIRST_DRAWING DAYS_CREDIT,EXT_SOURCE_3 EXT_SOURCE_1 AMT_ANNUITY_x,DAYS_BIRTH DAYS_CREDIT,DAYS_BIRTH DAYS_CREDIT^2
0,0.177549,0.923573,0.778428,0.337542,0.102132,0.073886,0.130787,0.024074,False,0.000000,...,0.778428,0.020433,0.778428,0.003928,0.020433,0.053119,0.132889,0.010212,0.252637,0.189089
1,0.482907,0.341118,0.580538,0.403890,0.034792,0.061443,0.044387,0.120000,True,0.000000,...,0.580538,0.008220,0.580538,0.011636,0.008220,0.012168,0.451396,0.017225,0.377535,0.352900
2,0.681715,0.818464,0.518318,0.292616,0.147026,0.265830,0.154373,0.255093,True,0.000000,...,0.518318,0.016787,0.518318,0.013982,0.016787,0.112772,0.276349,0.093930,0.118618,0.048085
3,0.683628,0.596114,0.539134,0.634329,0.382716,0.184872,0.382022,0.168817,False,0.000000,...,0.360203,0.008309,0.240657,0.014262,0.012436,0.059415,0.237632,0.068138,0.330027,0.171706
4,0.597163,0.497880,0.197840,0.687091,0.145903,0.118761,0.144944,0.097436,True,0.000000,...,0.197840,0.011806,0.197840,0.014160,0.011806,0.011698,0.374708,0.014031,0.431136,0.270529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.717489,0.758567,0.518318,0.296449,0.057239,0.061846,0.091775,0.287500,False,0.000000,...,0.518318,0.017156,0.518318,0.016227,0.017156,0.024317,0.538927,0.023000,0.222671,0.167255
48740,0.597163,0.800697,0.518318,0.791601,0.113356,0.118147,0.144173,0.167342,False,0.000000,...,0.518318,0.018986,0.518318,0.014160,0.018986,0.049033,0.374708,0.036569,0.496714,0.311678
48741,0.316237,0.740082,0.758271,0.524634,0.068462,0.123201,0.067416,0.085484,False,0.000000,...,0.758271,0.015239,0.758271,0.006512,0.015239,0.069138,0.257903,0.029543,0.427858,0.348934
48742,0.664367,0.521288,0.378138,0.634780,0.102132,0.091699,0.101124,0.186559,True,0.400000,...,0.378138,0.011210,0.378138,0.014286,0.011210,0.018076,0.265150,0.023037,0.253342,0.101109


## Теперь можно попробовать обучить новые бустинги с подбором гиперпараметров на обновленных фичах

In [None]:
import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Словарь для хранения лучших моделей
best_models = {}
# Определяем StratifiedKFold для кросс-валидации
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Функция для подбора гиперпараметров XGBoost с кросс-валидацией
def optimize_xgboost(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "subsample": trial.suggest_float("subsample", 0.6, 0.8),
        "min_child_weight": trial.suggest_int("min_child_weight", 3, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.1, 5),
    }
    model = XGBClassifier(**params, random_state=42, n_estimators = 10000, use_label_encoder=False)
    scores = cross_val_score(model, X_filtered, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Функция для подбора гиперпараметров LightGBM с кросс-валидацией
def optimize_lightgbm(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 10),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 10),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 1),
        "max_bin": trial.suggest_int("max_bin", 128, 512),
    }
    model = LGBMClassifier(**params, random_state=42, verbose=-1, n_estimators = 10000, boosting_type='goss', subsample_for_bin=250000)
    scores = cross_val_score(model, X_filtered, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Функция для подбора гиперпараметров CatBoost с кросс-валидацией
def optimize_catboost(trial):
    params = {
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
    }
    model = CatBoostClassifier(**params, random_state=42, verbose=0, iterations=10000)
    scores = cross_val_score(model, X_filtered, y, cv=cv, scoring="roc_auc", n_jobs=-1)
    return scores.mean()

# Оптимизация гиперпараметров для каждой модели с расширенными гиперпараметрами
for model_name, optimize_function in zip(
    ["LightGMB", "XGBoost", "CatBoost"],
    [optimize_lightgbm, optimize_xgboost, optimize_catboost]
):
    print(f"Оптимизация гиперпараметров для {model_name}...")
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_function, n_trials=9)  # Увеличьте количество испытаний при необходимости
    best_models[model_name] = {
        "best_params": study.best_params,
        "best_score": study.best_value,
    }

# Вывод лучших параметров и результатов
for model_name, results in best_models.items():
    print(f"\nМодель: {model_name}")
    print(f"Лучшие параметры: {results['best_params']}")
    print(f"Лучший ROC-AUC: {results['best_score']:.4f}")

[I 2025-01-21 01:32:50,241] A new study created in memory with name: no-name-83ff9f75-cccb-4313-aebb-89176a7f644d


Оптимизация гиперпараметров для LightGMB...


[I 2025-01-21 01:36:18,376] Trial 0 finished with value: 0.7338781880106366 and parameters: {'max_depth': 9, 'learning_rate': 0.2983105278088781, 'num_leaves': 83, 'feature_fraction': 0.6356414904616909, 'bagging_fraction': 0.9625765693427744, 'min_data_in_leaf': 55, 'lambda_l1': 7.632439159005329, 'lambda_l2': 2.3033125460387183, 'min_gain_to_split': 0.7566249959004848, 'max_bin': 194}. Best is trial 0 with value: 0.7338781880106366.
[I 2025-01-21 01:41:12,141] Trial 1 finished with value: 0.7430744511146768 and parameters: {'max_depth': 5, 'learning_rate': 0.1985429196409654, 'num_leaves': 49, 'feature_fraction': 0.8664932146614572, 'bagging_fraction': 0.5323302390181235, 'min_data_in_leaf': 27, 'lambda_l1': 9.545249512213804, 'lambda_l2': 3.2433594752980968, 'min_gain_to_split': 0.7419750794702296, 'max_bin': 294}. Best is trial 1 with value: 0.7430744511146768.
[I 2025-01-21 01:45:15,791] Trial 2 finished with value: 0.7469857674161471 and parameters: {'max_depth': 7, 'learning_rat

## Попробуем на обновленном датасете прогнать старый лучший бустинг

In [259]:
import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

params = {
    'boosting_type': 'goss',
    'n_estimators': 10000,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 1,
    'is_unbalance': False,
    'silent':-1,
    'verbose':-1
}


LightGMB = LGBMClassifier(**params, random_state=42)
LightGMB.fit(X_filtered, y)
preds = LightGMB.predict_proba(test_transformed_filtered)[:, 1]
submit = test_transformed[['SK_ID_CURR']]
submit['TARGET'] = preds

submit.to_csv('try_lgmc_280.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit['TARGET'] = preds


In [257]:
params = {
    'n_estimators': 10000,
    'max_depth': 7,
    'learning_rate': 0.13682269627837404,
    'colsample_bytree': 0.7908617037009442,
    'subsample': 0.7792250035241123,
    'gamma': 2.076205173343447,
    'min_child_weight': 3,
    'lambda': 7.308530881486179,
    'alpha': 5.411462036667684,
    'scale_pos_weight': 0.39871111893608546
}


XGBoost = XGBClassifier(**params, random_state=42)
XGBoost.fit(X_filtered, y)
preds2 = XGBoost.predict_proba(test_transformed_filtered)[:, 1]
submit2 = test_transformed[['SK_ID_CURR']]
submit2['TARGET'] = preds2

submit2.to_csv('try_xgboost_460.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit2['TARGET'] = preds2


In [258]:
submit_combined = pd.concat([submit, submit2]).groupby('SK_ID_CURR', as_index=False)['TARGET'].mean()
submit_combined.to_csv('try_mean_for_light_and_xg_10000.csv', index=False)

In [35]:
params = {
    'iterations': 296,
    'depth': 5,
    'learning_rate': 0.1954565936080041,
    'l2_leaf_reg': 5.480211513796368,
    'border_count': 38,
    'bagging_temperature': 0.19292086350514703,
    'random_strength': 9.929988629550547
}

CatBoost = CatBoostClassifier(**params, random_state=42)
CatBoost.fit(X, y)
preds3 = CatBoost.predict_proba(test_transformed)[:, 1]
submit3 = test_transformed[['SK_ID_CURR']]
submit3['TARGET'] = preds3
submit3.to_csv('try_catboost_460.csv', index=False)

0:	learn: 0.5150490	total: 258ms	remaining: 1m 15s
1:	learn: 0.4176697	total: 334ms	remaining: 49.1s
2:	learn: 0.3609229	total: 404ms	remaining: 39.4s
3:	learn: 0.3274364	total: 475ms	remaining: 34.7s
4:	learn: 0.3072304	total: 541ms	remaining: 31.5s
5:	learn: 0.2949081	total: 623ms	remaining: 30.1s
6:	learn: 0.2874496	total: 693ms	remaining: 28.6s
7:	learn: 0.2804675	total: 765ms	remaining: 27.6s
8:	learn: 0.2743896	total: 835ms	remaining: 26.6s
9:	learn: 0.2705832	total: 894ms	remaining: 25.6s
10:	learn: 0.2677875	total: 971ms	remaining: 25.2s
11:	learn: 0.2660452	total: 1.05s	remaining: 24.8s
12:	learn: 0.2634003	total: 1.14s	remaining: 24.8s
13:	learn: 0.2620334	total: 1.21s	remaining: 24.3s
14:	learn: 0.2606954	total: 1.28s	remaining: 24.1s
15:	learn: 0.2598843	total: 1.35s	remaining: 23.7s
16:	learn: 0.2594892	total: 1.44s	remaining: 23.6s
17:	learn: 0.2590828	total: 1.5s	remaining: 23.2s
18:	learn: 0.2585977	total: 1.6s	remaining: 23.3s
19:	learn: 0.2579345	total: 1.67s	remainin

161:	learn: 0.2361848	total: 12.7s	remaining: 10.5s
162:	learn: 0.2361161	total: 12.8s	remaining: 10.4s
163:	learn: 0.2360815	total: 12.8s	remaining: 10.3s
164:	learn: 0.2360085	total: 12.9s	remaining: 10.3s
165:	learn: 0.2359592	total: 13s	remaining: 10.2s
166:	learn: 0.2358764	total: 13.1s	remaining: 10.1s
167:	learn: 0.2358347	total: 13.1s	remaining: 10s
168:	learn: 0.2357669	total: 13.2s	remaining: 9.92s
169:	learn: 0.2357002	total: 13.3s	remaining: 9.84s
170:	learn: 0.2356335	total: 13.4s	remaining: 9.76s
171:	learn: 0.2356115	total: 13.4s	remaining: 9.68s
172:	learn: 0.2355830	total: 13.5s	remaining: 9.59s
173:	learn: 0.2355263	total: 13.6s	remaining: 9.51s
174:	learn: 0.2354753	total: 13.6s	remaining: 9.43s
175:	learn: 0.2353974	total: 13.7s	remaining: 9.35s
176:	learn: 0.2353519	total: 13.8s	remaining: 9.26s
177:	learn: 0.2352995	total: 13.8s	remaining: 9.18s
178:	learn: 0.2352523	total: 13.9s	remaining: 9.09s
179:	learn: 0.2352048	total: 14s	remaining: 9s
180:	learn: 0.2351463

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit3['TARGET'] = preds3


In [71]:
submit_combined = pd.concat([submit, submit2]).groupby('SK_ID_CURR', as_index=False)['TARGET'].mean()
submit_combined.to_csv('try_mean_for_light_and_xg_.csv', index=False)

## Подготовка датасета для трейна стейкинга (состоит из базовых предсказаний бустингов на тесте)

In [22]:
# Предсказания трех лучших бустингов на трейне
preds_for_stacking = LightGMB.predict_proba(X)[:, 1]
preds2_for_stacking = XGBoost.predict_proba(X)[:, 1]
preds3_for_stacking = CatBoost.predict_proba(X)[:, 1]
preds_4_for_stacking = (preds_for_stacking + preds2_for_stacking + preds3_for_stacking) / 3

In [26]:
# Датасет с предсказаниями трейна тремя лучшими бустингами + их средним
stacking_data = {
    'preds_for_stacking': preds_for_stacking,
    'preds2_for_stacking': preds2_for_stacking,
    'preds3_for_stacking': preds3_for_stacking,
    'preds_4_for_stacking': preds_4_for_stacking,
    'target': y
}
stacking = pd.DataFrame(stacking_data)
stacking

Unnamed: 0,preds_for_stacking,preds2_for_stacking,preds3_for_stacking,preds_4_for_stacking,target
0,0.347328,0.192332,0.432977,0.324212,1
1,0.012712,0.004395,0.013586,0.010231,0
2,0.020901,0.007414,0.021203,0.016506,0
3,0.046825,0.014026,0.048443,0.036431,0
4,0.052801,0.020240,0.077285,0.050109,0
...,...,...,...,...,...
307506,0.095458,0.031436,0.097728,0.074874,0
307507,0.110729,0.044234,0.120359,0.091774,0
307508,0.067836,0.019920,0.046692,0.044816,0
307509,0.055647,0.031881,0.060036,0.049188,1


## Обучение стейкинга на основе бустинга + подбор гиперпараметров

In [24]:
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score
from lightgbm import LGBMClassifier

# Предполагаем, что X_stack и y_stack уже определены
X_stack = stacking[['preds_for_stacking', 'preds2_for_stacking', 'preds3_for_stacking', 'preds_4_for_stacking']]
y_stack = stacking['target']

# Определяем функцию для оптимизации
def objective(trial):
    # Подбираем гиперпараметры для LGBMClassifier
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),  # Используем suggest_float с log=True
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'num_leaves': trial.suggest_int('num_leaves', 7, 127),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Замена suggest_uniform
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),  # Замена suggest_uniform
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10, log=True),  # Используем suggest_float с log=True
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10, log=True),  # Используем suggest_float с log=True
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 20),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 20),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 1),
        "max_bin": trial.suggest_int("max_bin", 64, 512),
    }
    
    # Создаем модель LightGBM
    model_stacking = LGBMClassifier(**params, random_state=42)
    
    # StratifiedKFold для кросс-валидации
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    # Оценка через cross_val_score
    scores = cross_val_score(model_stacking, X_stack, y_stack, cv=cv, scoring="roc_auc", n_jobs=-1)
    
    # Средний ROC-AUC по всем фолдам
    return scores.mean()

# Создаем исследование
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Выводим лучшие параметры и метрику
print("Best hyperparameters:", study.best_params)
print("Best mean ROC-AUC across folds:", study.best_value)

# Финальная модель с лучшими параметрами
best_params = study.best_params
#final_model = LGBMClassifier(**best_params, random_state=42)
#final_model.fit(X_stack, y_stack)  # Обучаем на всех данных

[I 2025-01-20 15:14:31,211] A new study created in memory with name: no-name-98aeb84f-1fc6-4bc9-b74a-8e1789e941f0
[I 2025-01-20 15:14:42,489] Trial 0 finished with value: 0.8275969364819111 and parameters: {'learning_rate': 0.09352059849280826, 'n_estimators': 150, 'max_depth': 15, 'num_leaves': 47, 'min_child_samples': 15, 'subsample': 0.7530969374363632, 'colsample_bytree': 0.42893304533991994, 'reg_alpha': 7.752679615616853, 'reg_lambda': 0.002276663306659333, 'feature_fraction': 0.5422273162181994, 'bagging_fraction': 0.9476835065134567, 'min_data_in_leaf': 21, 'lambda_l1': 11.97606262046526, 'lambda_l2': 1.4049290265939574, 'min_gain_to_split': 0.17810574228354104, 'max_bin': 294}. Best is trial 0 with value: 0.8275969364819111.
[I 2025-01-20 15:14:50,407] Trial 1 finished with value: 0.8279695067705937 and parameters: {'learning_rate': 0.08208150439785869, 'n_estimators': 192, 'max_depth': 13, 'num_leaves': 122, 'min_child_samples': 64, 'subsample': 0.6792111501798105, 'colsample

Best hyperparameters: {'learning_rate': 0.10318634305764093, 'n_estimators': 398, 'max_depth': 5, 'num_leaves': 89, 'min_child_samples': 79, 'subsample': 0.9640552621083184, 'colsample_bytree': 0.8711390944898216, 'reg_alpha': 0.0022840689108970507, 'reg_lambda': 0.16455624112025888, 'feature_fraction': 0.6261370369404968, 'bagging_fraction': 0.8647324154903343, 'min_data_in_leaf': 34, 'lambda_l1': 16.765668911796702, 'lambda_l2': 11.517455937367343, 'min_gain_to_split': 0.1573226475621463, 'max_bin': 462}
Best mean ROC-AUC across folds: 0.8280081399578936


Best hyperparameters: {'learning_rate': 0.03156394702578222, 'n_estimators': 397, 'max_depth': 13, 'num_leaves': 96, 'min_child_samples': 57, 'subsample': 0.6102982842590267, 'colsample_bytree': 0.8710165287288435, 'reg_alpha': 1.0123557794141835, 'reg_lambda': 0.022164215507268024, 'feature_fraction': 0.9930818732819482, 'bagging_fraction': 0.7664288423497708, 'min_data_in_leaf': 84, 'lambda_l1': 17.611202987199025, 'lambda_l2': 4.898289118395143, 'min_gain_to_split': 0.12332205366938159, 'max_bin': 143}
Best mean ROC-AUC across folds: 0.8282095695058395

## Обучение стейкинга на основе случайного леса + подбор гиперпараметров

In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Предполагаем, что X_stack и y_stack уже определены
X_stack = stacking[['preds_for_stacking', 'preds2_for_stacking', 'preds3_for_stacking', 'preds_4_for_stacking']]
y_stack = stacking['target']

# Определяем функцию для оптимизации
def objective(trial):
    # Подбираем гиперпараметры для RandomForestClassifier
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20)
    }
    
    # Создаем модель Random Forest
    model_stacking_forest = RandomForestClassifier(**params, random_state=42)
    
    # StratifiedKFold для кросс-валидации
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    # Оценка через cross_val_score
    scores = cross_val_score(model_stacking_forest, X_stack, y_stack, cv=cv, scoring="roc_auc", n_jobs=-1)
    
    # Средний ROC-AUC по всем фолдам
    return scores.mean()

# Создаем исследование
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Выводим лучшие параметры и метрику
print("Best hyperparameters:", study.best_params)
print("Best mean ROC-AUC across folds:", study.best_value)

# Финальная модель с лучшими параметрами
best_params = study.best_params
#final_model = RandomForestClassifier(**best_params, random_state=42)
#final_model.fit(X_stack, y_stack)  # Обучаем на всех данных

[I 2025-01-20 15:16:42,573] A new study created in memory with name: no-name-46297245-7b00-43ef-b63e-6594d52bfe63
[I 2025-01-20 15:19:55,002] Trial 0 finished with value: 0.826609498487479 and parameters: {'n_estimators': 206, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.826609498487479.
[I 2025-01-20 15:28:52,277] Trial 1 finished with value: 0.8237021443669942 and parameters: {'n_estimators': 282, 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.826609498487479.
[I 2025-01-20 15:29:58,175] Trial 2 finished with value: 0.8197778925934223 and parameters: {'n_estimators': 128, 'max_depth': 3, 'min_samples_split': 16, 'min_samples_leaf': 17}. Best is trial 0 with value: 0.826609498487479.
[I 2025-01-20 15:33:52,325] Trial 3 finished with value: 0.8231991383950333 and parameters: {'n_estimators': 357, 'max_depth': 4, 'min_samples_split': 12, 'min_samples_leaf': 17}. Best is trial 0 with value:

## датасет Stacking на тесте (состоит из базовых предсказаний бустингов на тесте)

In [24]:
preds4 = (preds + preds2 + preds3) / 3
stacking_data_test = {
    'preds_for_stacking': preds,
    'preds2_for_stacking': preds2,
    'preds3_for_stacking': preds3,
    'preds_4_for_stacking': preds4,
}
stacking_test = pd.DataFrame(stacking_data_test)
stacking_test

Unnamed: 0,preds_for_stacking,preds2_for_stacking,preds3_for_stacking,preds_4_for_stacking
0,0.044847,0.019078,0.038463,0.034129
1,0.108711,0.049422,0.133623,0.097252
2,0.022417,0.021791,0.018742,0.020983
3,0.042300,0.014631,0.040104,0.032345
4,0.140459,0.070118,0.137821,0.116133
...,...,...,...,...
48739,0.071623,0.033287,0.080607,0.061839
48740,0.058867,0.021907,0.074897,0.051890
48741,0.011917,0.006119,0.013083,0.010373
48742,0.056238,0.018681,0.051603,0.042174


## Фиксируем лучшие найденные параметры и с ними обучаемся и делаем предикт

In [30]:
params = {
    'learning_rate': 0.03156394702578222,
    'n_estimators': 397,
    'max_depth': 13,
    'num_leaves': 96,
    'min_child_samples': 57,
    'subsample': 0.6102982842590267,
    'colsample_bytree': 0.8710165287288435,
    'reg_alpha': 1.0123557794141835,
    'reg_lambda': 0.022164215507268024,
    'feature_fraction': 0.9930818732819482,
    'bagging_fraction': 0.7664288423497708,
    'min_data_in_leaf': 84,
    'lambda_l1': 17.611202987199025,
    'lambda_l2': 4.898289118395143,
    'min_gain_to_split': 0.12332205366938159,
    'max_bin': 143
}

X_stack = stacking[['preds_for_stacking', 'preds2_for_stacking', 'preds3_for_stacking', 'preds_4_for_stacking']]
y_stack = stacking['target']

LightGMB_stacking = LGBMClassifier(**params, random_state=42)
LightGMB_stacking.fit(X_stack, y_stack)
preds_stacking_boosting = LightGMB_stacking.predict_proba(stacking_test)[:, 1]
submit_stacking_boosting = test_transformed[['SK_ID_CURR']]
submit_stacking_boosting['TARGET'] = preds_stacking_boosting
submit_stacking_boosting

submit_stacking_boosting.to_csv('try_stacking_on_boosting.csv', index=False)

[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001009 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 572
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_stacking_boosting['TARGET'] = preds_stacking_boosting


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.045078
1,100005,0.096982
2,100013,0.044923
3,100028,0.019689
4,100038,0.140435
...,...,...
48739,456221,0.070167
48740,456222,0.032642
48741,456223,0.009602
48742,456224,0.033225
