# Baseline Model — Application Data Only

## Objective
Build a reproducible baseline model using only the main application table.

## Why a baseline matters
* Establish a performance reference
* Validate preprocessing pipeline
* Ensure proper cross-validation strategy

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier

In [4]:
train = pd.read_csv("../data/application_train.csv")
test  = pd.read_csv("../data/application_test.csv")

In [5]:
y = train["TARGET"]
X = train.drop(columns=["TARGET", "SK_ID_CURR"])
test_ids = test["SK_ID_CURR"]
test = test.drop(columns=["SK_ID_CURR"])

In [6]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

## Preprocessing Strategy

### Numeric features
* Median imputation

### Categorical features
* One-hot encoding
* Ignore unseen categories in test data


In [11]:
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [12]:
X_num = pd.DataFrame(
    num_imputer.fit_transform(X[num_cols]),
    columns=num_cols
)

test_num = pd.DataFrame(
    num_imputer.transform(test[num_cols]),
    columns=num_cols
)

In [13]:
X_cat = pd.DataFrame(
    cat_imputer.fit_transform(X[cat_cols]),
    columns=cat_cols
)

test_cat = pd.DataFrame(
    cat_imputer.transform(test[cat_cols]),
    columns=cat_cols
)

X_cat = pd.DataFrame(
    ohe.fit_transform(X_cat),
    columns=ohe.get_feature_names_out(cat_cols)
)

test_cat = pd.DataFrame(
    ohe.transform(test_cat),
    columns=ohe.get_feature_names_out(cat_cols)
)

In [17]:
X_processed = pd.concat([X_num, X_cat], axis=1)
test_processed = pd.concat([test_num, test_cat], axis=1)

print(X_processed.shape)

(307511, 244)


In [18]:
import re

def clean_feature_names(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [
        re.sub(r'[^A-Za-z0-9_]+', '_', str(c))  # keep only letters/numbers/underscore
        for c in df.columns
    ]
    return df

X_processed = clean_feature_names(X_processed)
test_processed = clean_feature_names(test_processed)

# ensure columns match exactly
test_processed = test_processed.reindex(columns=X_processed.columns, fill_value=0)

In [19]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []

In [20]:
for train_idx, valid_idx in folds.split(X_processed, y):

    X_train, X_valid = X_processed.iloc[train_idx], X_processed.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="auc",
        callbacks=[]
    )

    preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, preds)

    scores.append(auc)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11328
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 234
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11418
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 234
[LightGBM]

In [21]:
print("Mean AUC:", np.mean(scores))
print("Std AUC:", np.std(scores))

Mean AUC: 0.7561639696159295
Std AUC: 0.004412255446440042


# Baseline Model Results

## Cross-Validation Performance

* Mean AUC: **0.7562**
* Standard deviation: **0.0044**

### Interpretation
* The model shows **stable performance across folds**, indicating a robust validation setup.
* This baseline uses **only application-level features**, without external behavioral data.
* The result is strong for a first model and provides a reliable benchmark for future feature engineering.

### Why AUC?
The dataset is highly imbalanced, and AUC measures the model’s ability to correctly rank high-risk vs low-risk clients across all thresholds.


In [23]:
model.fit(X_processed, y)

test_preds = model.predict_proba(test_processed)[:, 1]

submission = pd.DataFrame({
    "SK_ID_CURR": test_ids,
    "TARGET": test_preds
})

submission.to_csv(ROOT / "outputs" / "baseline_submission.csv", index=False)

[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11427
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 235
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486


### Validation Strategy
* 5-fold Stratified Cross-Validation
* No data leakage
* Consistent preprocessing across folds

This ensures that performance estimates generalize to unseen data.