In [1]:
# Exploring the Home Credit dataset. The focus here is on application_train.csv, since it
# contains the features used to train the model and the TARGET label (whether a loan defaulted).

# 1. Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from collections import Counter


In [9]:
# Notebook configuration.

# Column names the later cells key on: the label column and the row identifier.
UNIQUE_KEY = "SK_ID_CURR"
TARGET = "TARGET"

# Filesystem locations, relative to the notebook's working directory:
# the raw input table and the folder that receives every generated artefact.
OUT_DIR = Path("../data/clean/")
DATA_PATH = Path("../data/raw/application_train.csv")

In [10]:
# Read the raw application table and remember its (rows, columns) shape
# before any cleaning or feature engineering touches it.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
orig_shape = tuple(df.shape)

In [11]:
# Helper functions: NaN-safe division and grouping of very rare categories.
def safe_div(a, b):
    """Element-wise ``a / b`` that yields NaN wherever ``b`` is zero or missing.

    Parameters
    ----------
    a, b : array-like (pandas Series or NumPy arrays)
        Numerator and denominator; they must be broadcastable against each other.

    Returns
    -------
    numpy.ndarray
        The quotient, with NaN in every position where ``b == 0`` or ``b`` is NaN.
        ``np.where`` returns a plain array, so any pandas index is not carried over.
    """
    invalid = (b == 0) | pd.isna(b)
    # The quotient is computed for every element, including the ones that are
    # about to be discarded. Silence NumPy's divide-by-zero / invalid-value
    # warnings for those throw-away results instead of letting them leak out.
    with np.errstate(divide="ignore", invalid="ignore"):
        quotient = a / b
    return np.where(invalid, np.nan, quotient)

def reduce_rare(series, min_ratio=0.005):
    """Collapse infrequent values of ``series`` into the single label 'Other'.

    A value is kept when its share of all rows (missing values are counted as
    their own value) is at least ``min_ratio``; every other entry is replaced
    by the string 'Other'. The default threshold is 0.5%.
    """
    shares = series.value_counts(normalize=True, dropna=False)
    frequent_values = set(shares[shares.ge(min_ratio)].index)
    is_rare = ~series.isin(frequent_values)
    return series.mask(is_rare, other="Other")

In [12]:
# Explore the dataset: per-column missingness plus a quick shape/target overview.

# This is the first cell that writes into OUT_DIR. Create the folder (and any
# missing parents) up front so the notebook also runs on a fresh checkout
# where the output directory does not exist yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One row per column: absolute number of NaNs and the same figure as a
# percentage of all rows, sorted from most to least incomplete.
missing = df.isna().sum().sort_values(ascending=False).to_frame("n_missing")
missing["pct_missing"] = (missing["n_missing"] / len(df) * 100).round(2)
missing.to_csv(OUT_DIR / "missing_summary.csv")

# Plain-text snapshot: frame dimensions and, when the label column is present,
# the class counts. The encoding is pinned so the file does not depend on the
# platform's default locale.
with open(OUT_DIR / "quick_info.txt", "w", encoding="utf-8") as f:
    f.write(f"Rows x Cols: {df.shape}\n")
    if TARGET in df:
        f.write(df[TARGET].value_counts().to_string() + "\n")

In [13]:
# Data quality checks, collected in a single dict and written out as JSON.
dq = {
    # Is the identifier column free of duplicates? ("UNAVAILABLE" if the column is absent.)
    "unique_ids": df[UNIQUE_KEY].is_unique if UNIQUE_KEY in df.columns else "UNAVAILABLE",
    # Number of fully duplicated rows.
    "duplicate_rows": int(df.duplicated().sum()),
    # Columns holding exactly one distinct value (NaN counts as a value).
    "constant_columns": [
        name for name, n_distinct in df.nunique(dropna=False).items() if n_distinct == 1
    ],
    # Columns whose missing percentage is 60 or more.
    "high_missing_over_60pct": missing.index[missing["pct_missing"] >= 60].tolist(),
}

# Column-specific checks, only recorded when the column exists.
if "DAYS_EMPLOYED" in df.columns:
    dq["DAYS_EMPLOYED_365243_count"] = int(df["DAYS_EMPLOYED"].eq(365243).sum())
if "CODE_GENDER" in df.columns:
    dq["CODE_GENDER_counts"] = df["CODE_GENDER"].value_counts(dropna=False).to_dict()

dq_report = pd.Series(dq, dtype="object")
dq_report.to_json(OUT_DIR / "data_quality_checks.json", indent=2)

In [14]:
# Data cleaning: turn placeholder / impossible values into NaN.
dfc = df.copy()  # work on a copy so the raw DataFrame stays untouched


def _blank_out(frame, column, bad):
    """Set ``frame[column]`` to NaN wherever the boolean Series ``bad`` is True.

    ``Series.mask`` upcasts an integer column to float as part of the
    operation. Writing NaN into an integer column through ``.loc`` instead
    relies on implicit upcasting during assignment, which pandas has
    deprecated. Nothing is touched when no row is flagged, so the column
    keeps its dtype in that case.
    """
    if bad.any():
        frame[column] = frame[column].mask(bad, np.nan)


# 1) sentinel in DAYS_EMPLOYED: 365243 is a placeholder, not a real day count
if "DAYS_EMPLOYED" in dfc:
    _blank_out(dfc, "DAYS_EMPLOYED", dfc["DAYS_EMPLOYED"] == 365243)

# 2) rare/invalid gender label "XNA" is treated as missing
if "CODE_GENDER" in dfc:
    _blank_out(dfc, "CODE_GENDER", dfc["CODE_GENDER"] == "XNA")

# 3) impossible (negative) counts -> NaN
for c in ["CNT_CHILDREN", "CNT_FAM_MEMBERS"]:
    if c in dfc:
        _blank_out(dfc, c, dfc[c] < 0)

In [15]:
# Feature engineering, performed on a copy of the cleaned frame.
fe = dfc.copy()

# Day offsets -> years: negate, divide by 365.25 and round to two decimals.
for days_col, years_col in (
    ("DAYS_BIRTH", "AGE_YEARS"),
    ("DAYS_EMPLOYED", "EMPLOYED_YEARS"),
):
    if days_col in fe.columns:
        fe[years_col] = (-fe[days_col]).div(365.25).round(2)

# Ratio features as (new column, numerator, denominator). Each one is only
# created when both source columns exist; safe_div puts NaN where the
# denominator is zero or missing. Credit/income ratios come first, then the
# household ratios, which fixes the order of the new columns.
ratio_specs = (
    ("CREDIT_TO_INCOME", "AMT_CREDIT", "AMT_INCOME_TOTAL"),
    ("ANNUITY_TO_INCOME", "AMT_ANNUITY", "AMT_INCOME_TOTAL"),
    ("PAYMENT_RATE", "AMT_ANNUITY", "AMT_CREDIT"),
    ("GOODS_CREDIT_RATIO", "AMT_GOODS_PRICE", "AMT_CREDIT"),
    ("CHILDREN_RATIO", "CNT_CHILDREN", "CNT_FAM_MEMBERS"),
    ("INCOME_PER_PERSON", "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"),
)
for ratio_name, numerator, denominator in ratio_specs:
    if numerator in fe.columns and denominator in fe.columns:
        fe[ratio_name] = safe_div(fe[numerator], fe[denominator])

# Row-wise sum over all FLAG_DOCUMENT* columns.
doc_cols = [name for name in fe.columns if name.startswith("FLAG_DOCUMENT")]
if len(doc_cols) > 0:
    fe["N_DOCS_PROVIDED"] = fe[doc_cols].sum(axis=1)

# Row-wise mean / min / max over all EXT_SOURCE* columns.
ext_cols = [name for name in fe.columns if name.startswith("EXT_SOURCE")]
if len(ext_cols) > 0:
    ext_block = fe[ext_cols]
    fe["EXT_SOURCES_MEAN"] = ext_block.mean(axis=1)
    fe["EXT_SOURCES_MIN"] = ext_block.min(axis=1)
    fe["EXT_SOURCES_MAX"] = ext_block.max(axis=1)

# log(1 + x) transforms for the skewed amount columns; negatives are clipped to 0 first.
amount_cols = ("AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE")
for amount in amount_cols:
    if amount not in fe.columns:
        continue
    non_negative = fe[amount].clip(lower=0)
    fe["LOG1P_" + amount] = np.log1p(non_negative)

In [16]:
# Imputation: split the columns by dtype, then fill the gaps per group.
num_cols = list(fe.select_dtypes(include=[np.number]).columns)
cat_cols = list(fe.select_dtypes(include=["object"]).columns)

# Numeric columns get their own column median (computed over the whole frame);
# object columns get the explicit label "Missing".
column_medians = fe[num_cols].median()
fe[num_cols] = fe[num_cols].fillna(column_medians)
fe[cat_cols] = fe[cat_cols].fillna("Missing")

In [17]:
# Fold categories covering less than 0.5% of rows into "Other", one
# categorical column at a time.
for cat_col in cat_cols:
    collapsed = reduce_rare(fe[cat_col], min_ratio=0.005)
    fe[cat_col] = collapsed

In [23]:
# Flag columns with 80%+ missing values (measured on the raw data) as drop
# candidates. Note: this cell only builds the candidate list; nothing is
# actually removed from `fe` here.
is_mostly_missing = missing["pct_missing"] >= 80
cols_to_consider_drop = set(missing.index[is_mostly_missing])
cols_to_drop = fe.columns[fe.columns.isin(cols_to_consider_drop)].tolist()

In [24]:
# One-hot encode a sample made of the first rows (at most 20,000) rather than the full frame.
n_sample = len(fe) if len(fe) < 20000 else 20000
fe_sample = fe.head(n_sample).copy()
model_matrix_sample = pd.get_dummies(
    data=fe_sample,
    columns=cat_cols,
    dummy_na=False,
)


In [25]:
# Correlation of every numeric column of the engineered frame with the label,
# sorted from most positive to most negative and saved as a one-column CSV.
if TARGET in fe.columns:
    numeric_cols = fe.select_dtypes(include=[np.number]).columns
    numeric_frame = fe[numeric_cols]
    corr = numeric_frame.corrwith(df[TARGET]).sort_values(ascending=False)
    corr_table = corr.to_frame("corr_with_TARGET")
    corr_table.to_csv(OUT_DIR / "feature_target_correlations.csv")

In [26]:
# Persist small artefacts for inspection: the first 10,000 engineered rows
# and the one-hot encoded sample matrix (both without the index column).
engineered_sample_path = OUT_DIR / "application_train_engineered_sample.csv"
model_matrix_path = OUT_DIR / "application_train_model_matrix_sample.csv"

fe.iloc[:10000].to_csv(engineered_sample_path, index=False)
model_matrix_sample.to_csv(model_matrix_path, index=False)


In [27]:
# Plots: TARGET class balance plus AGE_YEARS / EMPLOYED_YEARS histograms.
#
# The histograms are drawn from `dfc` (cleaned, not imputed) rather than `fe`.
# By this point every numeric column of `fe` has had its NaNs replaced by the
# column median, so `fe["EMPLOYED_YEARS"].dropna()` drops nothing and all rows
# whose 365243 sentinel was blanked would pile up in one bin at the median.


def _finish_figure(fig, filename):
    """Tighten the layout, save the figure as OUT_DIR/filename and release it."""
    fig.tight_layout()
    fig.savefig(OUT_DIR / filename)
    plt.close(fig)


def _years_from_days(days):
    """Day offsets -> years, same formula as the engineered *_YEARS columns."""
    return (-days / 365.25).round(2)


# Guarded like the other cells that read the label column.
if TARGET in df:
    fig, ax = plt.subplots()
    df[TARGET].value_counts().sort_index().plot(kind="bar", ax=ax)
    ax.set(title="TARGET distribution (0=repaid, 1=default)", xlabel="TARGET", ylabel="Count")
    _finish_figure(fig, "target_distribution.png")

if "DAYS_BIRTH" in dfc:
    fig, ax = plt.subplots()
    _years_from_days(dfc["DAYS_BIRTH"]).dropna().plot(kind="hist", bins=40, ax=ax)
    ax.set(title="Distribution of AGE_YEARS", xlabel="Age (years)", ylabel="Count")
    _finish_figure(fig, "hist_age_years.png")

if "DAYS_EMPLOYED" in dfc:
    fig, ax = plt.subplots()
    # Values above 60 years are clipped so a few extremes do not stretch the axis.
    _years_from_days(dfc["DAYS_EMPLOYED"]).dropna().clip(upper=60).plot(kind="hist", bins=40, ax=ax)
    ax.set(title="Distribution of EMPLOYED_YEARS (clipped at 60)", xlabel="Years", ylabel="Count")
    _finish_figure(fig, "hist_employed_years.png")

In [28]:
# Headline numbers of the run, stored next to the other generated files.
summary = {
    "raw_shape": df.shape,
    "engineered_shape": fe.shape,
    "model_matrix_sample_shape": model_matrix_sample.shape,
    "dropped_candidates(>=80%missing)_count": len(cols_to_drop),
}
pd.Series(summary, dtype="object").to_csv(OUT_DIR/"summary_stats.csv")

# Point at the directory the files were actually written to (OUT_DIR);
# the previous message named "./outputs", which this notebook never writes to.
print(f"Done. See {OUT_DIR} for CSVs & PNGs.")

Done. See ./outputs for CSVs & PNGs.
