### Score 0.91375 ( max 0.92939 )
### Place: 2706 / 3724 ( top 27% )

# Evaluation

- Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.trial import Trial
from optuna.visualization import plot_param_importances, plot_parallel_coordinate
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from utils import EarlyStoppingCallback

In [2]:
sns.set_style('whitegrid')

In [3]:
dtype = {
    'annual_income': 'float32',
    'debt_to_income_ratio': 'float32',
    'credit_score': 'uint32',
    'loan_amount': 'float32',
    'interest_rate': 'float32',
    'gender': 'category',
    'martial_status': 'category',
    'education_level': 'category',
    'employment_status': 'category',
    'loan_purpose': 'category',
    'grade_subgrade': 'category',
}

In [4]:
test = pd.read_csv('./data/loan_back/test.csv', index_col='id', dtype=dtype)
train = pd.read_csv('./data/loan_back/train.csv', index_col='id', dtype=dtype)

In [5]:
train.head(5)

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,29367.990234,0.084,736,2528.419922,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.019531,0.166,636,4593.100098,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.199219,0.097,694,17005.150391,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.47998,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.699219,0.053,665,12184.429688,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.hist(figsize=(12, 10), bins=50, color='steelblue', edgecolor='black')
plt.tight_layout()
plt.grid(False)
plt.show()

In [None]:
# let's visualize all categorical features by circle plots
categorical_cols = train.select_dtypes(include=['category']).columns.tolist()
n_cols = 3
n_rows = int(np.ceil(len(categorical_cols) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
for i, col in enumerate(categorical_cols):
    ax = axes[i // n_cols, i % n_cols]
    train[col].value_counts().plot.pie(ax=ax, autopct='%1.1f%%', startangle=90, counterclock=False)
    ax.set_title(col)
    ax.set_ylabel('')
plt.tight_layout()
plt.show()

In [6]:
target_col = 'loan_paid_back'
X, y = train.drop(columns=target_col), train[target_col]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
one_hot_columns = ['gender', 'loan_purpose', 'grade_subgrade', 'employment_status']
categorical_columns = ['grade_subgrade', 'marital_status', 'education_level']
numerical_columns = test.select_dtypes(include=['number']).columns.tolist()

In [8]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

one_hot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder())
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('onehot', one_hot_transformer, one_hot_columns),
        ('cat', categorical_transformer, categorical_columns),
    ])

In [None]:
estimator = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

In [None]:
model = make_pipeline(
    preprocessor,
    estimator
)

In [None]:
model.fit(X_train, y_train)

In [None]:
roc_score = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])
print(f'ROC AUC Score: {roc_score}')

In [None]:
classifer = model.named_steps['randomforestclassifier']
importances = classifer.feature_importances_
feature_names = model.named_steps['columntransformer'].get_feature_names_out()
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print('Total num features: ', len(feature_importances))
feature_importances

In [12]:
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

In [13]:
X_train_processed = pd.DataFrame(X_train_processed.toarray(), columns=preprocessor.get_feature_names_out())
X_valid_processed = pd.DataFrame(X_valid_processed.toarray(), columns=preprocessor.get_feature_names_out())

In [None]:
X_valid_processed.head(5)

In [None]:
# 1. Fit the pipeline once to get transformed features
model.fit(X_train, y_train)

# 2. Extract transformed training and validation data
X_train_trans = model.named_steps['columntransformer'].transform(X_train)
X_valid_trans = model.named_steps['columntransformer'].transform(X_valid)

feature_names = model.named_steps['columntransformer'].get_feature_names_out()

# 3. Use the RF estimator inside the pipeline as base estimator for RFE
# rf = model.named_steps['randomforestclassifier']

In [None]:

auc_score_list = []
max_num_features = len(feature_names)

clf = LogisticRegression(max_iter=200)

for k in range(1, max_num_features + 1):
  rfe = RFE(
      estimator=clf,
      n_features_to_select=k,
      step=5,
      verbose=0,
  )
  X_train_sel = rfe.fit_transform(X_train_trans, y_train)
  X_valid_sel = rfe.transform(X_valid_trans)

  clf.fit(X_train_sel, y_train)
  y_pred = clf.predict_proba(X_valid_sel)[:, 1]

  roc_score = roc_auc_score(y_valid, y_pred)
  print(f'ROC: {roc_score}, Num features: {k}')
  auc_score_list.append(roc_score)

In [None]:
auc_score_list.index(max(auc_score_list))

In [None]:
list(zip(range(1, max_num_features +1 ), auc_score_list))

In [None]:
# base estimator for RFE
base_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# RFE wrapped estimator
rfe_clf = RFE(
    estimator=base_clf,
    n_features_to_select=47,  # choose k, or tune via CV
    step=1,
    verbose=1
)

# full pipeline: preprocess -> RFE+LogReg
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rfe', rfe_clf)
])

# fit and evaluate
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_valid)[:, 1]
roc = roc_auc_score(y_valid, y_pred)
print('ROC AUC:', roc)

# get mask of selected features after fit
# feature_names = model.named_steps['preprocess'].get_feature_names_out()
# support_mask = model.named_steps['rfe'].support_
# selected_features = feature_names[support_mask]
# print('Selected features:', selected_features)

In [None]:
# get mask of selected features after fit
feature_names = model.named_steps['preprocess'].get_feature_names_out()
support_mask = model.named_steps['rfe'].support_
selected_features = feature_names[support_mask]
print('Selected features:', selected_features)

In [None]:
len(selected_features)

In [10]:
features = ['num__annual_income',
            'num__debt_to_income_ratio',
            'num__credit_score',
            'num__loan_amount',
            'num__interest_rate',
            'onehot__gender_Female',
            'onehot__gender_Male',
            'onehot__loan_purpose_Business',
            'onehot__loan_purpose_Car',
            'onehot__loan_purpose_Debt consolidation',
            'onehot__loan_purpose_Education',
            'onehot__loan_purpose_Home',
            'onehot__loan_purpose_Medical',
            'onehot__loan_purpose_Other',
            'onehot__grade_subgrade_A2',
            'onehot__grade_subgrade_A3',
            'onehot__grade_subgrade_A5',
            'onehot__grade_subgrade_B1',
            'onehot__grade_subgrade_B2',
            'onehot__grade_subgrade_B3',
            'onehot__grade_subgrade_B4',
            'onehot__grade_subgrade_B5',
            'onehot__grade_subgrade_C1',
            'onehot__grade_subgrade_C2',
            'onehot__grade_subgrade_C3',
            'onehot__grade_subgrade_C4',
            'onehot__grade_subgrade_C5',
            'onehot__grade_subgrade_D1',
            'onehot__grade_subgrade_D2',
            'onehot__grade_subgrade_D3',
            'onehot__grade_subgrade_D4',
            'onehot__grade_subgrade_D5',
            'onehot__grade_subgrade_E3',
            'onehot__grade_subgrade_E4',
            'onehot__grade_subgrade_F1',
            'onehot__grade_subgrade_F2',
            'onehot__grade_subgrade_F3',
            'onehot__grade_subgrade_F4',
            'onehot__grade_subgrade_F5',
            'onehot__employment_status_Employed',
            'onehot__employment_status_Retired',
            'onehot__employment_status_Self-employed',
            'onehot__employment_status_Student',
            'onehot__employment_status_Unemployed',
            'cat__grade_subgrade',
            'cat__marital_status',
            'cat__education_level']
# ROC AUC: 0.9070720344346717

In [14]:
X_train_processed = X_train_processed[features]
X_valid_processed = X_valid_processed[features]


In [21]:
# python
def objective(trial: Trial):
    model = RandomForestClassifier(
        random_state=42,
        n_estimators=trial.suggest_int(
            name='n_estimators',
            low=50,
            high=250,
            step=25,
            log=False,
        ),
        min_samples_leaf=trial.suggest_int(
            name='min_samples_leaf',
            low=1,
            high=10,
            step=1,
            log=False,
        ),
        min_samples_split=trial.suggest_int(
            name='min_samples_split',
            low=2,
            high=10,
            step=1,
            log=False,
        ),
        max_depth=trial.suggest_int(
            name='max_depth',
            low=5,
            high=50,
            step=5,
            log=False,
        ),
    )
    score = cross_val_score(
        model,
        X_train_processed,
        y_train,
        n_jobs=-1,
        cv=3,
        scoring='roc_auc',
    )
    return score.mean()

In [22]:
study = create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective,
               n_trials=200,
               show_progress_bar=True,
               n_jobs=-1,
               callbacks=[EarlyStoppingCallback(patience=5, min_delta=1e-4)])

[I 2026-01-03 18:35:02,520] A new study created in memory with name: no-name-80373815-6f73-450c-a412-ef0a814ab865


  0%|          | 0/200 [00:00<?, ?it/s]


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[I 2026-01-03 18:40:37,990] Trial 1 finished with value: 0.8989667980279522 and parameters: {'n_estimators': 200, 'min_samples_leaf': 2, 'min_samples_split': 9, 'max_depth': 5}. Best is trial 1 with value: 0.8989667980279522.
[I 2026-01-03 18:41:04,761] Trial 6 finished with value: 0.8991279687616384 and parameters: {'n_estimators': 175, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_depth': 5}. Best is trial 6 with value: 0.8991279687616384.
[I 2026-01-03 18:41:49,484] Trial 0 finished with value: 0.8989423945634049 and parameters: {'n_estimators': 150, 'min_samples_leaf': 8, 'min_samples_split': 5, 'max_depth': 5}. Best is trial 6 with value: 0.8991279687616384.
[I 2026-01-03 18:42:47,886] Trial 3 finished with value: 0.9120320498347838 and parameters: {'n_estimators': 75, 'min_samples_leaf': 9, 'min_samples_split': 8, 'max_depth': 20}. Best is trial 3 with value: 0.9120320498347838.
[I 2026-01-03 18:43:15,871] Trial 2 finished with value: 0.908527073119055 and parameters: {'n_e


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[I 2026-01-03 18:46:56,449] Trial 9 finished with value: 0.9121498249158394 and parameters: {'n_estimators': 125, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 20}. Best is trial 9 with value: 0.9121498249158394.
[I 2026-01-03 18:48:11,608] Trial 10 finished with value: 0.9119482853779561 and parameters: {'n_estimators': 200, 'min_samples_leaf': 4, 'min_samples_split': 6, 'max_depth': 15}. Best is trial 9 with value: 0.9121498249158394.
[I 2026-01-03 18:51:25,639] Trial 11 finished with value: 0.91165228942649 and parameters: {'n_estimators': 225, 'min_samples_leaf': 6, 'min_samples_split': 9, 'max_depth': 45}. Best is trial 9 with value: 0.9121498249158394.
[I 2026-01-03 18:53:32,020] Trial 13 finished with value: 0.9124283887155674 and parameters: {'n_estimators': 150, 'min_samples_leaf': 6, 'min_samples_split': 4, 'max_depth': 20}. Best is trial 13 with value: 0.9124283887155674.
[I 2026-01-03 18:53:41,756] Trial 15 finished with value: 0.9091867663403143 and parameter

In [23]:
plot_param_importances(study)

In [24]:
plot_parallel_coordinate(study)

In [25]:
study.best_params

{'n_estimators': 150,
 'min_samples_leaf': 6,
 'min_samples_split': 4,
 'max_depth': 20}

In [None]:
best_params = {'n_estimators': 150, 'min_samples_leaf': 6, 'min_samples_split': 4, 'max_depth': 20}

In [26]:
study.best_value

0.9124283887155674

In [27]:
final_model = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
final_model.fit(X_train_processed, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",150
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",20
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",4
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",6
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [28]:
y_pred = final_model.predict_proba(X_valid_processed)[:, 1]
roc = roc_auc_score(y_valid, y_pred)
print('Final ROC AUC:', roc)

Final ROC AUC: 0.9126762449626581


In [31]:
X_test_processed = preprocessor.transform(test)
X_test_processed = pd.DataFrame(X_test_processed.toarray(), columns=preprocessor.get_feature_names_out())
X_test_processed = X_test_processed[features]

In [32]:
y_pred_test = final_model.predict_proba(X_test_processed)[:, 1]
y_pred = y_pred_test

In [37]:
submission = pd.DataFrame(
    index=test.index,
    columns=['loan_paid_back'],
    data=y_pred
).reset_index()

In [None]:
submission.to_csv('./data/loan_back/submission.csv', index=False)