In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel; -q keeps the install log out of the notebook.
%pip install -q catboost lightgbm



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import warnings

# Silence the warning categories that would otherwise flood the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

print("Шаг 1: Загрузка данных...")
train_df = pd.read_csv("/kaggle/input/mai-ml-lab-2-308/train_c.csv")
test_df = pd.read_csv("/kaggle/input/mai-ml-lab-2-308/test_c.csv")
# Keep the test identifiers aside: they are dropped from the features below
# and re-attached when the submission file is written.
test_ids = test_df["ID"]

# Training-set cleaning: remove exact duplicate rows, then keep only rows whose
# Experience lies in [0, 45] and whose Age lies in [0, 75].
# Series.between is inclusive on both ends and evaluates to False for NaN,
# so rows with a missing Experience/Age are removed as well.
train_df = train_df.drop_duplicates()
train_df = train_df.loc[train_df["Experience"].between(0, 45)].copy()
train_df = train_df.loc[train_df["Age"].between(0, 75)].copy()

# Separate the target from the training features; the test frame only loses its ID.
X = train_df.drop(columns=["LoanApproved"])
y = train_df["LoanApproved"]
X_test = test_df.drop(columns=["ID"])

Шаг 1: Загрузка данных...


In [3]:
print("Feature engineering...")

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Pipeline step that fills missing values and adds hand-crafted features.

    ``fit`` learns, from the data it is given:
      * ``median_date_`` / ``first_date_`` - median and earliest parsed
        ``ApplicationDate``;
      * ``mode_vals_`` - most frequent value of each categorical column;
      * ``high_debt_threshold_`` - 0.75 quantile of ``DebtToIncomeRatio``
        (``None`` when that column is absent);
      * ``train_columns_`` - the columns produced for the fitting data.

    ``transform`` rebuilds the same features for any frame and returns them
    with exactly ``train_columns_`` (columns missing from the input are added
    as zeros, extra columns are discarded).
    """

    def __init__(self):
        # No hyper-parameters. The learned attributes start as None so that
        # transform() can detect an unfitted instance.
        self.median_date_ = None
        self.first_date_ = None
        self.mode_vals_ = None
        self.high_debt_threshold_ = None
        self.train_columns_ = None

    def fit(self, X, y=None):
        """Learn fill values, date anchors and the output column layout from ``X``.

        Every learned attribute is recomputed from scratch, so fitting the
        same instance again never reuses state from an earlier fit.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw feature frame; must contain ``ApplicationDate`` and the
            columns read unconditionally in ``_build_features``.
        y : ignored

        Returns
        -------
        self
        """
        dates = pd.to_datetime(X['ApplicationDate'], errors='coerce')
        # Median over the integer representation of the valid dates.
        self.median_date_ = pd.to_datetime(dates.dropna().astype('int64').median())
        self.first_date_ = dates.min()

        categorical_cols = ['EducationLevel', 'MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus']
        # Skip columns that are absent or entirely missing instead of failing:
        # transform() already tolerates a categorical column without a mode.
        self.mode_vals_ = {}
        for col in categorical_cols:
            if col not in X.columns:
                continue
            modes = X[col].mode()
            if not modes.empty:
                self.mode_vals_[col] = modes.iloc[0]

        # Always reassigned (possibly to None) so a refit cannot keep a stale threshold.
        if 'DebtToIncomeRatio' in X.columns:
            self.high_debt_threshold_ = X['DebtToIncomeRatio'].quantile(0.75)
        else:
            self.high_debt_threshold_ = None

        # Column layout is derived from the un-aligned features of *this* X,
        # never from columns remembered by a previous fit.
        self.train_columns_ = self._build_features(X).columns
        return self

    def transform(self, X, y=None):
        """Return the engineered features for ``X`` laid out as ``train_columns_``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same raw columns as the one passed to ``fit``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Engineered features; columns unseen in ``X`` are filled with 0.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if self.mode_vals_ is None or self.train_columns_ is None:
            # Local import: only needed on this error path.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This FeatureEngineer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        df = self._build_features(X)
        # Align with the layout seen during fit: add absent columns as zeros,
        # then select (and order) exactly the training columns.
        for c in self.train_columns_:
            if c not in df.columns:
                df[c] = 0
        return df[self.train_columns_]

    def _build_features(self, X):
        """Fill missing values and compute all derived columns (no column alignment)."""
        df = X.copy()

        # --- Missing values: numeric NaN -> 0, categorical NaN -> fitted mode.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(0)
        for col, mode in self.mode_vals_.items():
            if col in df.columns:
                df[col] = df[col].fillna(mode)

        # --- Log / square-root transforms; inputs are clipped at 0 first so the
        # transforms are defined for negative values.
        df['AnnualIncome_log'] = np.log1p(np.clip(df['AnnualIncome'], 0, None))
        df['LoanAmount_log'] = np.log1p(np.clip(df['LoanAmount'], 0, None))
        df['InterestRate_log'] = np.log1p(np.clip(df['InterestRate'], 0, None))
        df['TotalDebtToIncomeRatio_log'] = np.log1p(np.clip(df['TotalDebtToIncomeRatio'], 0, None))
        # This one overwrites the original column rather than adding a new one.
        df['CreditCardUtilizationRate'] = np.sqrt(np.clip(df['CreditCardUtilizationRate'], 0, None))
        df['CreditScore_sqrt'] = np.sqrt(np.clip(df['CreditScore'], 0, None))

        # --- Ratios and interactions.
        df['Income_to_Assets'] = df['MonthlyIncome'] / (df['TotalAssets'] + 1)
        df['Debt_to_IncomeRate'] = df['TotalDebtToIncomeRatio'] * df['InterestRate']
        df['Income_Sqrt'] = np.sqrt(np.clip(df['AnnualIncome_log'], 0, None))
        df['Income_per_Debt'] = df['AnnualIncome_log'] / (df['TotalDebtToIncomeRatio'] + 1)
        df['DebtToIncome_x_Interest'] = df['DebtToIncomeRatio'] * df['InterestRate']
        df['CreditScore_Income'] = df['CreditScore'] * df['AnnualIncome_log']
        # NOTE(review): subtracts raw MonthlyIncome from log-scale annual income / 12;
        # kept as is because the tuned model was trained on it - confirm it is intended.
        df['Incomedif'] = df['AnnualIncome_log'] / 12 - df['MonthlyIncome']
        df['TotalAssets_sqrt'] = np.sqrt(np.clip(df['TotalAssets'], 0, None))
        df['MonthlyDebtPayments_sqrt'] = np.sqrt(np.clip(df['MonthlyDebtPayments'], 0, None))
        pos_skewed = ['TotalLiabilities', 'CheckingAccountBalance', 'SavingsAccountBalance']
        for col in pos_skewed:
            df[col + '_log'] = np.log1p(np.clip(df[col], 0, None))

        # --- Squares and cubes of a fixed list of features.
        top_features = ['CreditScore', 'AnnualIncome', 'LoanAmount', 'InterestRate', 'DebtToIncomeRatio', 'LengthOfCreditHistory', 'MonthlyIncome']
        for feature in top_features:
            if feature in df.columns:
                df[f'{feature}_sq'] = df[feature] ** 2
                df[f'{feature}_cb'] = df[feature] ** 3

        # --- More ratios; "+ 1" in the denominators avoids division by zero
        # when the underlying value is 0.
        df['Savings_to_Debt_Ratio'] = df['SavingsAccountBalance'] / (df['MonthlyDebtPayments'] + 1)
        if 'Age' in df.columns and 'LengthOfCreditHistory' in df.columns:
            df['CreditHistory_to_Age_Ratio'] = df['LengthOfCreditHistory'] / ((df['Age'] - 18) * 12 + 1)
        df['Loan_to_Income_Ratio'] = df['LoanAmount_log'] / (df['AnnualIncome_log'] + 1)
        df['DSR'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)
        df['Disposable_Income'] = df['MonthlyIncome'] - df['MonthlyDebtPayments']
        df['Liabilities_to_Assets_Ratio'] = df['TotalLiabilities'] / (df['TotalAssets'] + 1)
        df['Credit_Lines_Utilization'] = df['NumberOfOpenCreditLines'] / (df['LengthOfCreditHistory'] + 1)
        df['Inquiries_per_Year'] = df['NumberOfCreditInquiries'] / (df['LengthOfCreditHistory'] / 12 + 1)
        df['Is_New_Customer'] = (df['LengthOfCreditHistory'] < 24).astype(int)
        df['History_x_Defaults'] = df['LengthOfCreditHistory'] * df['PreviousLoanDefaults']

        # --- Calendar features; unparseable or missing dates fall back to the fitted median date.
        df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'], errors='coerce').fillna(self.median_date_)
        df['ApplicationDayOfYear'] = df['ApplicationDate'].dt.dayofyear
        df['ApplicationWeekOfYear'] = df['ApplicationDate'].dt.isocalendar().week.astype('Int64')
        df['ApplicationYear'] = df['ApplicationDate'].dt.year
        df['DaysSinceStart'] = (df['ApplicationDate'] - self.first_date_).dt.days
        df['Is_Month_Start'] = df['ApplicationDate'].dt.is_month_start.astype(int)
        df['Is_Month_End'] = df['ApplicationDate'].dt.is_month_end.astype(int)

        # --- Remaining optional features (each guarded on its source columns).
        if 'Age' in df.columns:
            df['Age_sq_smart'] = df['Age'] ** 2
        if 'DebtToIncomeRatio' in df.columns and self.high_debt_threshold_ is not None:
            df['is_high_debt'] = (df['DebtToIncomeRatio'] > self.high_debt_threshold_).astype(int)
            if 'CreditScore' in df.columns:
                df['CreditScore_x_HighDebt'] = df['CreditScore'] * df['is_high_debt']
        if 'NumberOfCreditInquiries' in df.columns and 'PreviousLoanDefaults' in df.columns:
            df['Inquiries_x_Defaults'] = df['NumberOfCreditInquiries'] * df['PreviousLoanDefaults']

        # The raw datetime column is not passed on to the later pipeline steps.
        df = df.drop(columns=['ApplicationDate'], errors='ignore')
        return df

Feature engineering...


In [4]:
print("\nСборка пайплайна предобработки...")

# Column groups, each handled by its own transformer below.
education_level_col = ['EducationLevel']
credit_score_col = ['CreditScore']
onehot_cols = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus']
# Raw columns that belong to none of the groups above and are not the date column.
numeric_features = [
    col
    for col in X.columns
    if col not in {*education_level_col, *credit_score_col, *onehot_cols, 'ApplicationDate'}
]

# Ordinal-encode education with an explicit category order (unknown -> -1),
# Yeo-Johnson-transform the credit score, one-hot encode the nominal columns,
# and pass every other column through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'education',
            OrdinalEncoder(
                categories=[['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']],
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
            education_level_col,
        ),
        (
            'credit_score',
            PowerTransformer(method='yeo-johnson'),
            credit_score_col,
        ),
        (
            'onehot',
            OneHotEncoder(handle_unknown='ignore', drop='first'),
            onehot_cols,
        ),
    ],
    remainder='passthrough',
)


Сборка пайплайна предобработки...


In [5]:
print("\nШаг 4: Обучение стекинга и подбор гиперпараметров...")

# Base learners of the stack: two gradient-boosting models and an L1 logistic regression.
estimators = [
    (
        'catboost',
        CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, random_state=42, verbose=0),
    ),
    (
        'lightgbm',
        LGBMClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, verbosity=-1),
    ),
    (
        'logreg',
        LogisticRegression(C=0.91, penalty='l1', random_state=42, solver='liblinear', max_iter=1000),
    ),
]

# Meta-learner that combines the base learners' predictions.
final_estimator = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1,
)

# Full chain: feature engineering -> encoding -> imputation -> scaling -> stacked classifier.
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor),
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
    ('classifier', stacking_model),
])

# Candidate values, addressed as <pipeline step>__<base learner>__<parameter>.
param_distributions = {
    'classifier__catboost__iterations': [150, 200, 250],
    'classifier__catboost__depth': [4, 6, 8],
    'classifier__catboost__learning_rate': [0.05, 0.1, 0.15],
    'classifier__lightgbm__n_estimators': [150, 200, 250],
    'classifier__lightgbm__max_depth': [4, 6, 8],
    'classifier__lightgbm__learning_rate': [0.05, 0.1, 0.15],
    'classifier__logreg__C': [0.5, 0.91, 1.5],
}

# Randomised search: 10 sampled candidates, 2-fold CV each, scored by ROC-AUC.
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=2,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

search.fit(X, y)

print("Лучшая комбинация гиперпараметров:", search.best_params_)
print(f"Лучший ROC-AUC на CV (RandomizedSearchCV): {search.best_score_:.4f}")

# Best candidate as exposed by the search object.
best_model = search.best_estimator_


Шаг 4: Обучение стекинга и подбор гиперпараметров...
Fitting 2 folds for each of 10 candidates, totalling 20 fits
Лучшая комбинация гиперпараметров: {'classifier__logreg__C': 0.5, 'classifier__lightgbm__n_estimators': 150, 'classifier__lightgbm__max_depth': 6, 'classifier__lightgbm__learning_rate': 0.15, 'classifier__catboost__learning_rate': 0.15, 'classifier__catboost__iterations': 250, 'classifier__catboost__depth': 8}
Лучший ROC-AUC на CV (RandomizedSearchCV): 0.9851


In [6]:
print("\nШаг 5: Оценка лучшей модели и создание прогнозов...")

# 5-fold cross-validated ROC-AUC of the selected pipeline on the training data.
cv_scores = cross_val_score(
    best_model,
    X,
    y,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
print(f"ROC-AUC на CV (лучшая модель): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Second column of predict_proba, written to the submission as LoanApproved.
predictions = best_model.predict_proba(X_test)[:, 1]

# Pair each test ID with its predicted probability and write the submission file.
submission_df = pd.DataFrame(
    {
        'ID': test_ids,
        'LoanApproved': predictions,
    }
)
submission_df.to_csv('submission_stacking_3_models.csv', index=False)
print("\nФайл 'submission_stacking_3_models.csv' успешно создан.")


Шаг 5: Оценка лучшей модели и создание прогнозов...
ROC-AUC на CV (лучшая модель): 0.9857 ± 0.0005

Файл 'submission_stacking_3_models.csv' успешно создан.
