In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [18]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
import catboost as cb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb

warnings.filterwarnings('ignore')

# Configure pandas display settings for better data visualization
pd.options.display.max_columns = None

def preprocess_numerical_features(df):
    """
    Handle numerical features by converting strings to numbers and imputing missing values.
    Args:
        df: Input DataFrame
    Returns:
        DataFrame with processed numerical features
    """
    # Fill missing values with median
    df['person_emp_length'] = df['person_emp_length'].fillna(df['person_emp_length'].median())
    df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].median())

    # Convert to appropriate numeric types
    df['person_emp_length'] = df['person_emp_length'].astype(int)
    df['loan_int_rate'] = (df['loan_int_rate'] * 100).astype(int)
    df['loan_percent_income'] = (df['loan_percent_income'] * 100).astype(int)

    return df

def encode_categorical_features(df):
    """
    Convert categorical variables to appropriate data types and encode them.
    Args:
        df: Input DataFrame
    Returns:
        DataFrame with encoded categorical features
    """
    categorical_mappings = {
        'person_home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OWN': 2, 'OTHER': 3},
        'loan_intent': {'EDUCATION': 0, 'MEDICAL': 1, 'PERSONAL': 2, 'VENTURE': 3,
                       'DEBTCONSOLIDATION': 4, 'HOMEIMPROVEMENT': 5},
        'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6},
        'cb_person_default_on_file': {'N': 0, 'Y': 1}
    }

    for col, mapping in categorical_mappings.items():
        df[col] = df[col].replace(mapping)

    return df

def engineer_credit_features(df):
    """
    Create advanced features for credit risk assessment.
    Args:
        df: Input DataFrame
    Returns:
        DataFrame with additional engineered features
    """
    # Calculate financial ratios and interactions with safety checks
    df['debt_to_income'] = np.clip(
        ((df['loan_amnt'] / df['person_income']) - df['loan_percent_income']),
        -1e6, 1e6
    )
    df['age_income_factor'] = np.clip(
        (df['person_age'] * df['person_income']),
        0, 1e9
    )
    # Avoid division by zero in employment ratio
    df['loan_employment_ratio'] = np.clip(
        df['loan_amnt'] / df['person_emp_length'].replace(0, 1),
        0, 1e6
    )

    # Monthly financial calculations with safety checks
    monthly_income = np.maximum(df['person_income'] / 12, 1)  # Ensure no zero division
    df['monthly_loan_payment'] = np.clip(
        (df['loan_amnt'] * (1 + df['loan_int_rate']/100) / 12),
        0, 1e6
    )
    df['monthly_debt_ratio'] = np.clip(
        (df['monthly_loan_payment'] / monthly_income),
        0, 1e2
    )

    # Risk assessment feature
    df['credit_risk_indicator'] = np.where(
        (df['cb_person_default_on_file'] == 1) &
        (df['loan_grade'].isin([2, 3, 4])),
        1, 0
    )

    # Normalize loan-to-income ratio with safety check
    df["normalized_loan_income_ratio"] = np.clip(
        ((df["loan_amnt"] / np.maximum(df["person_income"], 1)) * 1000000),
        0, 1e9
    ).astype(int)

    return df

# Load datasets
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')
supplementary_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/credit_risk_dataset.csv')
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sample_submission.csv')


# Initial data preparation
train = pd.concat([train, supplementary_data], ignore_index=True)

# Apply preprocessing steps
train = preprocess_numerical_features(train)
train = encode_categorical_features(train)
train = engineer_credit_features(train)

test = preprocess_numerical_features(test)
test = encode_categorical_features(test)
test = engineer_credit_features(test)

# Prepare features and target
X = train.drop(['loan_status', 'id'], axis=1)
y = train['loan_status']
test = test.drop(['id'], axis=1)

# Define model parameters
catboost_params = {
    'task_type': "GPU",  # Use GPU acceleration
    'devices': '0',  # Use first GPU
    'loss_function': 'Logloss',
    'eval_metric': "AUC",
    'iterations': 5000,
    'learning_rate': 0.045,
    'max_depth': 7,
    'l2_leaf_reg': 0.80,
    'min_data_in_leaf': 30,
    'random_state': 42,
    'early_stopping_rounds': 200,
    'use_best_model': False,
    'allow_writing_files': False,
    'bootstrap_type': 'Bernoulli',  # GPU-compatible bootstrap type
    'subsample': 0.8,  # Instead of bagging_temperature
    'gpu_ram_part': 0.95  # Use 95% of GPU memory
}

lgbm_params = {
    'n_estimators': 1779,
    'learning_rate': 0.018971107281482297,
    'num_leaves': 100,
    'max_depth': 13,
    'min_child_samples': 6,
    'subsample': 0.9557780001541148,
    'colsample_bytree': 0.6125766049856997,
    'reg_alpha': 2.2391090634191215,
    'reg_lambda': 0.023135194031906982,
    'random_state': 42,
    'n_jobs': -1  # Use all CPU cores
}

xgb_params = {
    'max_depth': 9,
    'learning_rate': 0.09763827013301708,
    'n_estimators': 929,
    'min_child_weight': 2,
    'gamma': 0.0737000842530037,
    'subsample': 0.999299053610849,
    'colsample_bytree': 0.7331921391500718,
    'reg_alpha': 1.5593266144989077,
    'reg_lambda': 2.2285698685927673,
    'scale_pos_weight': 1.4106632040101776,
    'tree_method': 'gpu_hist',  # Use GPU histogram method
    'device': 'cuda',  # Use CUDA for GPU acceleration
    'random_state': 42
}

# Data validation and cleaning
def clean_infinite_values(df):
    """
    Clean infinite values from dataframe and ensure all numeric columns are within reasonable bounds
    Args:
        df: Input DataFrame
    Returns:
        DataFrame with cleaned values
    """
    # Replace infinities with NaN first
    df = df.replace([np.inf, -np.inf], np.nan)

    # For each numeric column, clip values to reasonable bounds
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = np.clip(df[col].fillna(df[col].median()), -1e9, 1e9)

    return df

# Clean data before training
print("Cleaning training data...")
X = clean_infinite_values(X)
print("Cleaning test data...")
test = clean_infinite_values(test)

# Train models
print("Training CatBoost model...")
catboost_model = CatBoostClassifier(**catboost_params)
catboost_model.fit(X, y)

print("Training LightGBM model...")
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X, y)

print("Training XGBoost model...")
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X, y)

# Generate predictions
pred_catboost = catboost_model.predict_proba(test)[:, 1]
pred_lgbm = lgbm_model.predict_proba(test)[:, 1]
pred_xgb = xgb_model.predict_proba(test)[:, 1]

# Create ensemble predictions
submission['loan_status'] = (pred_catboost + pred_lgbm + pred_xgb) / 3
submission.to_csv('submission.csv', index=False)


Cleaning training data...
Cleaning test data...
Training CatBoost model...


Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
10:	total: 69.3ms	remaining: 31.4s
11:	total: 75.4ms	remaining: 31.4s
12:	total: 81.6ms	remaining: 31.3s
13:	total: 87.7ms	remaining: 31.2s
14:	total: 93.8ms	remaining: 31.2s
15:	total: 100ms	remaining: 31.3s
16:	total: 107ms	remaining: 31.3s
17:	total: 113ms	remaining: 31.2s
18:	total: 119ms	remaining: 31.2s
19:	total: 125ms	remaining: 31.2s
20:	total: 132ms	remaining: 31.2s
21:	total: 138ms	remaining: 31.2s
22:	total: 144ms	remaining: 31.2s
23:	total: 150ms	remaining: 31.1s
24:	total: 156ms	remaining: 31.1s
25:	total: 163ms	remaining: 31.1s
26:	total: 169ms	remaining: 31.2s
27:	total: 176ms	remaining: 31.3s
28:	total: 183ms	remaining: 31.3s
29:	total: 190ms	remaining: 31.4s
30:	total: 197ms	remaining: 31.6s
31:	total: 203ms	remaining: 31.6s
32:	total: 210ms	remaining: 31.6s
33:	total: 216ms	remaining: 31.6s
34:	total: 223ms	remaining: 31.6s
35:	total: 229ms	remaining: 31.5s
36:	total: 235ms	remaining: 3

# Новый раздел