# Phase 2: Feature Engineering

Having completed our EDA, we now clean the data, engineer predictive features, and prepare everything for modelling.

In [2]:
import numpy as np
import pandas as pd

# 1. Read the Phase 1 output and keep only loans whose outcome is final.
# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine where this file exists at this location.
df = pd.read_csv(
    '/Users/Cathaml/Desktop/accepted_cleaned.csv',
    header=0,
    low_memory=False,
)

# Rows with any other loan_status are discarded; .copy() detaches the result
# from the full frame so the column assignment below does not warn.
df = df.loc[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

# Binary target: 1 = charged off, 0 = fully paid
df['default_flag'] = df['loan_status'].eq('Charged Off').astype(int)

## 1. Data Cleaning & Preprocessing

We first drop columns that are mostly empty, then convert percentage and date fields to proper types, and finally impute missing numeric values and encode the categorical columns.

In [4]:
# 1.1 Discard sparse columns (more than 80% of their values missing)
missing = df.isnull().mean()
to_drop = missing.loc[missing.gt(0.8)].index
df = df.drop(columns=to_drop)

# 1.2 Percentage fields -> float
# Casting to str first lets .str.replace run whatever dtype the column was
# read as; anything that still fails to parse becomes NaN (errors='coerce').
for pct_col in ('int_rate', 'revol_util'):
    pct_text = df[pct_col].astype(str).str.replace('%', '', regex=False)
    df[pct_col] = pd.to_numeric(pct_text, errors='coerce')

# 1.3 Month-year strings (format '%b-%Y', e.g. "Dec-2015") -> timestamps
for date_col in ('issue_d', 'earliest_cr_line'):
    df[date_col] = pd.to_datetime(df[date_col], format='%b-%Y')

# 1.4 Numeric gaps are filled with each column's median; the target column
# is excluded so it is never imputed.
num_cols = df.select_dtypes(include='number').columns.drop('default_flag')
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical gaps become an explicit 'Missing' level before the dtype cast.
cat_cols = ['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status']
for cat_col in cat_cols:
    filled = df[cat_col].fillna('Missing')
    df[cat_col] = filled.astype('category')


## 2. Feature Engineering

Based on our EDA insights, we create:
- **Binned loan amounts** at LendingClub’s tiers  
- **Debt-to-income buckets**  
- **Credit-utilization ratios**  
- **Vintage features** from issue date  
- **Ordinal encodings** for grade/subgrade  

In [5]:
# 2.1 Loan-amount bins
# pd.cut builds right-closed intervals from consecutive edges, so a value equal
# to an edge falls in the lower bin. Amounts outside (0, 40000] get NaN.
tiers = [0,5000,10000,15000,20000,25000,30000,35000,40000]
labels = [f"{lo}–{hi}" for lo, hi in zip(tiers[:-1], tiers[1:])]
df['amt_bin'] = pd.cut(df['loan_amnt'], bins=tiers, labels=labels)

# 2.2 DTI bucket
# Quintile index (labels=False returns integer bin numbers instead of
# intervals). duplicates='drop' merges coinciding quantile edges instead of
# raising ValueError when dti has many tied values; in that case fewer than
# five buckets are produced.
df['dti_bin'] = pd.qcut(df['dti'], 5, labels=False, duplicates='drop')

# 2.3 Credit utilization proxy
# Revolving balance relative to the funded amount; the +1 keeps the
# denominator non-zero.
df['util_ratio'] = df['revol_bal'] / (df['funded_amnt'] + 1)

# 2.4 Vintage features
df['issue_year']  = df['issue_d'].dt.year
df['issue_month'] = df['issue_d'].dt.month

# 2.5 Ordinal encode grade/subgrade
# Both columns must already be categorical (.cat accessor).
grade_map = {g: i for i, g in enumerate(sorted(df['grade'].cat.categories))}
# Series.map on a categorical column returns another categorical, so cast to
# int to make grade_ord a genuinely numeric feature.
# NOTE(review): if a 'Missing' level exists it sorts after 'G' and receives the
# highest rank — confirm that is the intended treatment.
df['grade_ord'] = df['grade'].map(grade_map).astype(int)
# Codes follow category order; for string levels created with
# astype('category') that order is lexical, i.e. A1 < A2 < ... < G5.
df['subgrade_ord'] = df['sub_grade'].cat.codes

# 2.6 One-hot encode the nominal categoricals
# drop_first=True omits the first level of each column to avoid redundant
# indicators. Rows whose amt_bin is NaN end up with all amt_bin indicators
# off, the same pattern as the dropped first bin.
df = pd.get_dummies(
    df,
    columns=['term','home_ownership','verification_status','amt_bin'],
    drop_first=True,
)

We save the processed dataset for modeling.

In [7]:
# Write the engineered frame to disk without the index column.
# NOTE(review): the file name ends in ".csv.csv", which looks unintended; it is
# left as-is because a later notebook may load this exact path — confirm
# before renaming.
df.to_csv(
    '/Users/Cathaml/Desktop/loans_fe.csv.csv',
    index=False,
)