In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Улучшенный RNN

In [3]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE

In [4]:
# Функция Focal Loss для учета классового дисбаланса
def focal_loss(alpha=0.25, gamma=2.0):
    def loss(y_true, y_pred):
        y_pred = K.clip(y_pred, 1e-7, 1 - 1e-7)
        loss = -y_true * alpha * K.pow(1 - y_pred, gamma) * K.log(y_pred) - \
               (1 - y_true) * (1 - alpha) * K.pow(y_pred, gamma) * K.log(1 - y_pred)
        return K.mean(loss)
    return loss

In [5]:
# Укажем путь к данным
DATA_PATH = r"/content/drive/MyDrive/home-credit-default-risk"

# Загружаем данные
app_train = pd.read_csv(os.path.join(DATA_PATH, "application_train.csv"))

# Выбираем только числовые признаки
num_features = app_train.select_dtypes(include=[np.number]).drop(columns=["TARGET", "SK_ID_CURR"])

# Заполняем пропущенные значения медианными
imputer = SimpleImputer(strategy='median')
num_features_imputed = pd.DataFrame(imputer.fit_transform(num_features), columns=num_features.columns)

# Добавляем новые признаки
num_features_imputed['CREDIT_INCOME_RATIO'] = num_features_imputed['AMT_CREDIT'] / (num_features_imputed['AMT_INCOME_TOTAL'] + 1)
num_features_imputed['ANNUITY_INCOME_RATIO'] = num_features_imputed['AMT_ANNUITY'] / (num_features_imputed['AMT_INCOME_TOTAL'] + 1)
num_features_imputed['DAYS_EMPLOYED_RATIO'] = num_features_imputed['DAYS_EMPLOYED'] / (num_features_imputed['DAYS_BIRTH'] + 1)

In [6]:
# Масштабируем данные
scaler = StandardScaler()
X = scaler.fit_transform(num_features_imputed)
y = app_train["TARGET"]

In [7]:
# Балансировка данных с SMOTE
smote = SMOTE(sampling_strategy=0.3, random_state=42)  # Увеличиваем класс 1 до 30% от класса 0
X_resampled, y_resampled = smote.fit_resample(X, y)

In [8]:
# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

In [9]:
# Создание улучшенной нейронной сети
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Выходной слой для бинарной классификации
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Компиляция модели с Focal Loss
model.compile(optimizer='adam', loss=focal_loss(alpha=0.25, gamma=2.0), metrics=['AUC'])

In [11]:
# Обучение модели
class_weight = {0: 1, 1: 10}  # Увеличиваем вес класса 1
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=512, class_weight=class_weight, verbose=1)

Epoch 1/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 28ms/step - AUC: 0.6399 - loss: 0.2156 - val_AUC: 0.7588 - val_loss: 0.0463
Epoch 2/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - AUC: 0.7295 - loss: 0.1499 - val_AUC: 0.7697 - val_loss: 0.0454
Epoch 3/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - AUC: 0.7482 - loss: 0.1444 - val_AUC: 0.7814 - val_loss: 0.0445
Epoch 4/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - AUC: 0.7626 - loss: 0.1405 - val_AUC: 0.7919 - val_loss: 0.0439
Epoch 5/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - AUC: 0.7762 - loss: 0.1381 - val_AUC: 0.8009 - val_loss: 0.0432
Epoch 6/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - AUC: 0.7797 - loss: 0.1376 - val_AUC: 0.8031 - val_loss: 0.0434
Epoch 7/30
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1

In [12]:
# Предсказания
y_prob = model.predict(X_test).flatten()
y_pred = (y_prob > 0.5).astype(int)

[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


In [13]:
# Оценка качества модели
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC Score (Improved Neural Network): {auc_score:.4f}")
print(classification_report(y_test, y_pred))

AUC-ROC Score (Improved Neural Network): 0.8683
              precision    recall  f1-score   support

           0       0.81      1.00      0.89     56538
           1       0.96      0.23      0.37     16961

    accuracy                           0.82     73499
   macro avg       0.89      0.61      0.63     73499
weighted avg       0.85      0.82      0.77     73499

