# 03 - Feature Engineering

Create and encode features for modeling.

**Steps**:

1. Create financial ratios
2. One-Hot encode categorical variables
3. Label encode high-cardinality categoricals
4. Prepare final feature set


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

# Paths
INTERIM_DATA = Path("../data/interim/lending_club_cleaned.parquet")
PROCESSED_DATA = Path("../data/processed")
PROCESSED_DATA.mkdir(parents=True, exist_ok=True)

In [2]:
# Load cleaned data
df = pd.read_parquet(INTERIM_DATA)
print(f"Loaded data shape: {df.shape}")

Loaded data shape: (1345310, 70)


## 1. Create Financial Ratios

Domain-specific features that are meaningful for credit risk.


In [None]:
# Loan-to-Income Ratio
# How much of annual income is the loan amount?
if "loan_amnt" in df.columns and "annual_inc" in df.columns:
    df["loan_to_income"] = df["loan_amnt"] / df["annual_inc"]
    # Cap extreme values
    df["loan_to_income"] = df["loan_to_income"].clip(
        upper=df["loan_to_income"].quantile(0.99)
    )
    print(
        f"loan_to_income: mean={df['loan_to_income'].mean():.3f}, max={df['loan_to_income'].max():.3f}"
    )

loan_to_income: mean=0.213, max=0.500


In [None]:
# Payment Burden (Annual Payment / Annual Income)
# What percentage of income goes to loan payments?
if "installment" in df.columns and "annual_inc" in df.columns:
    df["payment_to_income"] = (df["installment"] * 12) / df["annual_inc"]
    df["payment_to_income"] = df["payment_to_income"].clip(
        upper=df["payment_to_income"].quantile(0.99)
    )
    print(f"payment_to_income: mean={df['payment_to_income'].mean():.3f}")

payment_to_income: mean=0.079


In [None]:
# High Utilization Flag
# Credit utilization > 80% is a risk factor
if "revol_util" in df.columns:
    df["high_utilization"] = (df["revol_util"] > 80).astype(int)
    print(f"high_utilization: {df['high_utilization'].mean():.1%} of borrowers")

high_utilization: 14.6% of borrowers


In [None]:
# DTI Risk Category
if "dti" in df.columns:
    df["dti_risk"] = pd.cut(
        df["dti"],
        bins=[-np.inf, 10, 20, 35, np.inf],
        labels=["low", "moderate", "high", "very_high"],
    )
    print("DTI Risk distribution:")
    print(df["dti_risk"].value_counts())

DTI Risk distribution:
dti_risk
moderate     562485
high         501731
low          246179
very_high     34915
Name: count, dtype: int64


In [None]:
# Income Category
if "annual_inc" in df.columns:
    df["income_category"] = pd.cut(
        df["annual_inc"],
        bins=[0, 30000, 60000, 100000, 200000, np.inf],
        labels=["low", "lower_middle", "middle", "upper_middle", "high"],
    )
    print("Income category distribution:")
    print(df["income_category"].value_counts())

Income category distribution:
income_category
lower_middle    518592
middle          477381
upper_middle    222096
low              98935
high             27945
Name: count, dtype: int64


In [None]:
# Log transformations for skewed distributions
if "annual_inc" in df.columns:
    df["log_annual_inc"] = np.log1p(df["annual_inc"])

if "revol_bal" in df.columns:
    df["log_revol_bal"] = np.log1p(df["revol_bal"])

print("Added log transformations for skewed features")

Added log transformations for skewed features


## 2. Encode Categorical Variables


In [None]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
print(f"Categorical columns ({len(categorical_cols)}):")
for col in categorical_cols:
    n_unique = df[col].nunique()
    print(f"  {col}: {n_unique} unique values")

Categorical columns (13):
  term: 2 unique values
  grade: 7 unique values
  sub_grade: 35 unique values
  emp_length: 12 unique values
  home_ownership: 6 unique values
  verification_status: 3 unique values
  purpose: 14 unique values
  earliest_cr_line: 739 unique values
  initial_list_status: 2 unique values
  application_type: 2 unique values
  disbursement_method: 2 unique values
  dti_risk: 4 unique values
  income_category: 5 unique values


In [None]:
# One-Hot Encoding for low-cardinality categoricals
low_card_cols = [
    "term",
    "grade",
    "home_ownership",
    "verification_status",
    "purpose",
    "application_type",
    "initial_list_status",
    "dti_risk",
    "income_category",
]

# Filter to existing columns
low_card_cols = [c for c in low_card_cols if c in df.columns]
print(f"One-Hot encoding columns: {low_card_cols}")

df_encoded = pd.get_dummies(df, columns=low_card_cols, drop_first=True)
print(f"Shape after One-Hot encoding: {df_encoded.shape}")

One-Hot encoding columns: ['term', 'grade', 'home_ownership', 'verification_status', 'purpose', 'application_type', 'initial_list_status', 'dti_risk', 'income_category']
Shape after One-Hot encoding: (1345310, 104)


In [None]:
# Label Encoding for high-cardinality categoricals (sub_grade)
if "sub_grade" in df_encoded.columns:
    le = LabelEncoder()
    df_encoded["sub_grade_encoded"] = le.fit_transform(
        df_encoded["sub_grade"].astype(str)
    )
    df_encoded = df_encoded.drop(columns=["sub_grade"])
    print(
        f"Label encoded sub_grade: {df_encoded['sub_grade_encoded'].nunique()} values"
    )

Label encoded sub_grade: 35 values


In [None]:
# Drop remaining object columns (text that can't be encoded meaningfully)
remaining_object_cols = df_encoded.select_dtypes(include=["object"]).columns.tolist()
if remaining_object_cols:
    print(f"Dropping remaining object columns: {remaining_object_cols}")
    df_encoded = df_encoded.drop(columns=remaining_object_cols, errors="ignore")

Dropping remaining object columns: ['emp_length', 'earliest_cr_line', 'disbursement_method']


In [None]:
# Drop original emp_length (we have emp_length_numeric)
if "emp_length" in df_encoded.columns:
    df_encoded = df_encoded.drop(columns=["emp_length"])

## 3. Final Cleanup


In [14]:
# Handle any remaining missing values
missing_count = df_encoded.isnull().sum().sum()
if missing_count > 0:
    print(f"Filling {missing_count} remaining missing values with median")
    df_encoded = df_encoded.fillna(df_encoded.median())
else:
    print("No missing values")

No missing values


In [15]:
# Handle infinite values
df_encoded = df_encoded.replace([np.inf, -np.inf], np.nan)
df_encoded = df_encoded.fillna(df_encoded.median())

In [16]:
# Final data types check
print("\nData types:")
print(df_encoded.dtypes.value_counts())


Data types:
float64    62
bool       36
int64       3
Name: count, dtype: int64


In [None]:
# Final summary
print("\n" + "=" * 50)
print("FEATURE ENGINEERING SUMMARY")
print("=" * 50)
print(f"Final shape: {df_encoded.shape}")
print(f"Number of features: {df_encoded.shape[1] - 1}")  # Exclude target
print(f"Target: 'default'")
print(f"Default rate: {df_encoded['default'].mean():.2%}")


FEATURE ENGINEERING SUMMARY
Final shape: (1345310, 101)
Number of features: 100
Target: 'default'
Default rate: 19.96%


In [None]:
# List all features
features = [c for c in df_encoded.columns if c != "default"]
print(f"\nFeatures ({len(features)}):")
for i, feat in enumerate(features, 1):
    print(f"{i:3}. {feat}")


Features (100):
  1. loan_amnt
  2. int_rate
  3. installment
  4. annual_inc
  5. dti
  6. delinq_2yrs
  7. fico_range_low
  8. fico_range_high
  9. inq_last_6mths
 10. open_acc
 11. pub_rec
 12. revol_bal
 13. revol_util
 14. total_acc
 15. last_fico_range_high
 16. last_fico_range_low
 17. collections_12_mths_ex_med
 18. policy_code
 19. acc_now_delinq
 20. tot_coll_amt
 21. tot_cur_bal
 22. total_rev_hi_lim
 23. acc_open_past_24mths
 24. avg_cur_bal
 25. bc_open_to_buy
 26. bc_util
 27. chargeoff_within_12_mths
 28. delinq_amnt
 29. mo_sin_old_il_acct
 30. mo_sin_old_rev_tl_op
 31. mo_sin_rcnt_rev_tl_op
 32. mo_sin_rcnt_tl
 33. mort_acc
 34. mths_since_recent_bc
 35. mths_since_recent_inq
 36. num_accts_ever_120_pd
 37. num_actv_bc_tl
 38. num_actv_rev_tl
 39. num_bc_sats
 40. num_bc_tl
 41. num_il_tl
 42. num_op_rev_tl
 43. num_rev_accts
 44. num_rev_tl_bal_gt_0
 45. num_sats
 46. num_tl_120dpd_2m
 47. num_tl_30dpd
 48. num_tl_90g_dpd_24m
 49. num_tl_op_past_12m
 50. pct_tl_nvr_d

## 4. Save Processed Data


In [None]:
# Save to processed folder
output_path = PROCESSED_DATA / "lending_club_processed.parquet"
df_encoded.to_parquet(output_path, index=False)

import os

file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
print(f"Saved to: {output_path}")
print(f"File size: {file_size_mb:.1f} MB")

Saved to: ../data/processed/lending_club_processed.parquet
File size: 108.9 MB


## Next Steps

Proceed to `04_eda.ipynb` for:

- Exploratory Data Analysis
- Visualizations
- Correlation analysis
