# 03 — Feature Engineering

**Goal:** Create clear, business-friendly features from the cleaned loan dataset to improve model signal and interpretability.

**Input:** `data/interim/loan_data_clean_start.csv`  
**Outputs:**  
- `data/processed/loan_features.csv`  
- `data/interim/feature_dictionary.csv`

In [2]:
!mkdir -p Loan_Repayment_Behaviour_Analytics/{data/raw,data/interim,data/processed,notebooks,images,reports,src}
!touch Loan_Repayment_Behaviour_Analytics/{README.md,requirements.txt}
!ls -R Loan_Repayment_Behaviour_Analytics

Loan_Repayment_Behaviour_Analytics:
data  images  notebooks  README.md  reports  requirements.txt  src

Loan_Repayment_Behaviour_Analytics/data:
interim  processed  raw

Loan_Repayment_Behaviour_Analytics/data/interim:
data_dictionary_seed.csv  feature_dictionary.csv  loan_data_clean_start.csv
eda_hypotheses.txt	  intake_summary.csv

Loan_Repayment_Behaviour_Analytics/data/processed:
loan_features.csv

Loan_Repayment_Behaviour_Analytics/data/raw:
loan_data_raw.csv

Loan_Repayment_Behaviour_Analytics/images:
ApplicantIncome_by_target.png  approval_by_Self_Employed.png
approval_by_Dependents.png     CoapplicantIncome_by_target.png
approval_by_Education.png      correlation_heatmap.png
approval_by_Gender.png	       Credit_History_by_target.png
approval_by_Loan_ID.png        LoanAmount_by_target.png
approval_by_Married.png        Loan_Amount_Term_by_target.png
approval_by_Property_Area.png  numeric_distributions.png

Loan_Repayment_Behaviour_Analytics/notebooks:

Loan_Repayment_Behaviour_A

In [3]:
import sys
from pathlib import Path
# Add src folder to Python path (so imports work in notebooks)
ROOT = Path.cwd()
if ROOT.name == "content":
    ROOT = ROOT / "Loan_Repayment_Behaviour_Analytics"

# if running from notebooks folder, go up one level
elif ROOT.name.lower() == "notebooks":
    ROOT = ROOT.parent
ROOT

PosixPath('/content/Loan_Repayment_Behaviour_Analytics')

In [4]:
# ---  Environment Setup ---

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(str(ROOT / "src"))

from utils_paths import get_project_paths

paths = get_project_paths()

# Unpack for easy use
env = paths["env"]
DATA_RAW = paths["DATA_RAW"]
DATA_INTERIM = paths["DATA_INTERIM"]
DATA_PROCESSED = paths["DATA_PROCESSED"]
IMAGES = paths["IMAGES"]

print(f"Environment: {env}")
print("RAW:", DATA_RAW)
print("INTERIM:", DATA_INTERIM)
print("PROCESSED:", DATA_PROCESSED)

Environment: Colab
RAW: /content/Loan_Repayment_Behaviour_Analytics/data/raw
INTERIM: /content/Loan_Repayment_Behaviour_Analytics/data/interim
PROCESSED: /content/Loan_Repayment_Behaviour_Analytics/data/processed


In [5]:
clean_path = DATA_INTERIM / "loan_data_clean_start.csv"
df = pd.read_csv(clean_path)
print(f"Loaded {clean_path.name} → shape {df.shape}")
df.head()


Loaded loan_data_clean_start.csv → shape (614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# Normalize a few categorical fields for consistency
def norm_yesno(s):
    return (s.astype(str)
              .str.strip()
              .str.upper()
              .map({"Y":"Y","YES":"Y","N":"N","NO":"N"}))

if "Married" in df.columns:
    df["Married"] = norm_yesno(df["Married"])
if "Self_Employed" in df.columns:
    df["Self_Employed"] = norm_yesno(df["Self_Employed"])
if "Gender" in df.columns:
    df["Gender"] = df["Gender"].astype(str).str.strip().str.title()

# Dependents: convert '3+' → 3 (int)
if "Dependents" in df.columns:
    df["Dependents"] = df["Dependents"].astype(str).str.replace("+","", regex=False)
    df["Dependents"] = pd.to_numeric(df["Dependents"], errors="coerce").fillna(0).astype(int)

# Education: standardise to 'Graduate' / 'Not Graduate'
if "Education" in df.columns:
    df["Education"] = df["Education"].astype(str).str.strip().str.title()
    df["Education"] = df["Education"].replace({"Not Graduate":"Not Graduate", "Graduate":"Graduate"})


In [7]:
import numpy as np
# Safeguard numeric columns
for c in ["ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# 1) TotalIncome: monthly total income
df["TotalIncome"] = df[["ApplicantIncome","CoapplicantIncome"]].sum(axis=1)

# 2) IncomeToLoanRatio: income relative to requested loan amount (loan is in thousands)
# Avoid zero/NaN division
loan_amt = df["LoanAmount"].replace(0, np.nan)
df["IncomeToLoanRatio"] = df["TotalIncome"] / (loan_amt * 1000)

# 3) EMI (approx): monthly installment if loan fully amortised
# EMI ≈ (LoanAmount*1000) / Loan_Amount_Term
term = df["Loan_Amount_Term"].replace(0, np.nan)
df["EMI"] = (df["LoanAmount"]*1000) / term

# 4) BalanceIncome: remaining income after EMI
df["BalanceIncome"] = df["TotalIncome"] - df["EMI"]

# 5) FamilySize: applicant + dependents + spouse (if Married=Y then +1)
df["FamilySize"] = 1 + df.get("Dependents", 0)
if "Married" in df.columns:
    df["FamilySize"] = df["FamilySize"] + (df["Married"]=="Y").astype(int)

# 6) HasCreditHistory: binary flag
if "Credit_History" in df.columns:
    df["HasCreditHistory"] = (df["Credit_History"] == 1).astype(int)

# 7) IsGraduate & IsSelfEmployed flags
if "Education" in df.columns:
    df["IsGraduate"] = (df["Education"]=="Graduate").astype(int)
if "Self_Employed" in df.columns:
    df["IsSelfEmployed"] = (df["Self_Employed"]=="Y").astype(int)

# 8) Income bands (for dashboard/fairness)
df["IncomeBand"] = pd.cut(
    df["TotalIncome"],
    bins=[-np.inf, 2500, 5000, 10000, 20000, np.inf],
    labels=["<2.5k","2.5–5k","5–10k","10–20k",">20k"]
)

# 9) Loan amount bands (thousands)
df["LoanAmtBand"] = pd.cut(
    df["LoanAmount"],
    bins=[-np.inf, 100, 150, 200, 300, np.inf],
    labels=["<100k","100–150k","150–200k","200–300k",">300k"]
)

# 10) Term bands (months)
df["TermBand"] = pd.cut(
    df["Loan_Amount_Term"],
    bins=[-np.inf, 180, 240, 300, 360, np.inf],
    labels=["<=180","181–240","241–300","301–360",">360"]
)

# Clean any lingering infs from divisions
df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [8]:
# For engineered columns, simple median/mode fills to keep dataset model-ready
fill_median = ["IncomeToLoanRatio","EMI","BalanceIncome","TotalIncome"]
for c in fill_median:
    if c in df.columns:
        df[c] = df[c].fillna(df[c].median())

fill_mode = ["IncomeBand","LoanAmtBand","TermBand"]
for c in fill_mode:
    if c in df.columns:
        df[c] = df[c].astype("category")
        if df[c].isna().any():
            df[c] = df[c].cat.add_categories(["Unknown"]).fillna("Unknown")

print("Remaining NA after fills:", int(df.isna().sum().sum()))


Remaining NA after fills: 121


In [9]:
feature_notes = {
    "TotalIncome": "Sum of applicant and co-applicant monthly income (currency units).",
    "IncomeToLoanRatio": "Total monthly income / (LoanAmount*1000). Higher may indicate affordability.",
    "EMI": "Approximate monthly installment = (LoanAmount*1000)/Loan_Amount_Term.",
    "BalanceIncome": "TotalIncome - EMI. Proxy for residual income after repayment.",
    "FamilySize": "Applicant (1) + dependents + spouse if married.",
    "HasCreditHistory": "1 if Credit_History == 1, else 0.",
    "IsGraduate": "1 if Education == 'Graduate'.",
    "IsSelfEmployed": "1 if Self_Employed == 'Y'.",
    "IncomeBand": "Binned TotalIncome for segmentation.",
    "LoanAmtBand": "Binned LoanAmount (thousands) for segmentation.",
    "TermBand": "Binned loan term (months) for segmentation."
}

# Build dictionary dataframe
cols = list(feature_notes.keys())
existing = [c for c in cols if c in df.columns]
rows = []
for c in existing:
    rows.append({
        "feature": c,
        "dtype": str(df[c].dtype),
        "description": feature_notes[c]
    })
feat_dict = pd.DataFrame(rows)

# Save
feat_dict_path = DATA_INTERIM / "feature_dictionary.csv"
feat_dict.to_csv(feat_dict_path, index=False)
print("Saved feature dictionary →", feat_dict_path)
feat_dict


Saved feature dictionary → /content/Loan_Repayment_Behaviour_Analytics/data/interim/feature_dictionary.csv


Unnamed: 0,feature,dtype,description
0,TotalIncome,float64,Sum of applicant and co-applicant monthly inco...
1,IncomeToLoanRatio,float64,Total monthly income / (LoanAmount*1000). High...
2,EMI,float64,Approximate monthly installment = (LoanAmount*...
3,BalanceIncome,float64,TotalIncome - EMI. Proxy for residual income a...
4,FamilySize,int64,Applicant (1) + dependents + spouse if married.
5,HasCreditHistory,int64,"1 if Credit_History == 1, else 0."
6,IsGraduate,int64,1 if Education == 'Graduate'.
7,IsSelfEmployed,int64,1 if Self_Employed == 'Y'.
8,IncomeBand,category,Binned TotalIncome for segmentation.
9,LoanAmtBand,category,Binned LoanAmount (thousands) for segmentation.


In [10]:
# Keep original columns + engineered ones; small dataset so it's fine
out_path = DATA_PROCESSED / "loan_features.csv"
df.to_csv(out_path, index=False)
print(f" Saved engineered dataset → {out_path} | shape {df.shape}")


 Saved engineered dataset → /content/Loan_Repayment_Behaviour_Analytics/data/processed/loan_features.csv | shape (614, 24)


In [12]:
checks = {}
if "Loan_Status" in df.columns and df["Loan_Status"].dtype == object:
    df["Loan_Status"] = df["Loan_Status"].map({"Y": 1, "N": 0})

if "Loan_Status" in df.columns:
    checks["target_mean"] = df["Loan_Status"].mean().round(3)

for c in ["IncomeToLoanRatio","EMI","BalanceIncome"]:
    if c in df.columns:
        checks[f"{c}_nonneg"] = bool((df[c] >= 0).all())

for c in ["IncomeBand","LoanAmtBand","TermBand"]:
    if c in df.columns:
        checks[f"{c}_has_levels"] = df[c].nunique() >= 2

checks


{'target_mean': np.float64(0.687),
 'IncomeToLoanRatio_nonneg': True,
 'EMI_nonneg': True,
 'BalanceIncome_nonneg': False,
 'IncomeBand_has_levels': True,
 'LoanAmtBand_has_levels': True,
 'TermBand_has_levels': True}