# Loan Default Prediction: Data Cleaning & Preprocessing

From our EDA analysis and visualizations, we found a few data quality problems that must be fixed before building any models:

Very large income values that appear to be errors

DTI values set to 999, which are placeholders

Missing entries in emp_length and revol_util

Categorical fields that need to be encoded

In this notebook, weâ€™ll clean and preprocess the dataset to make it ready for modeling.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Load your data
df = pd.read_csv('../raw_data/accepted_2007_to_2018Q4.csv', nrows=15000)

  df = pd.read_csv('../raw_data/accepted_2007_to_2018Q4.csv', nrows=15000)


In [2]:
# Select key columns (same as before)
key_columns = [
    'loan_amnt', 'int_rate', 'grade', 'emp_length', 'annual_inc',
    'dti', 'fico_range_low', 'revol_util', 'purpose',
    'home_ownership', 'loan_status'
]
df_subset = df[key_columns].copy()

In [3]:
##  Handle Missing Values

# Check missing values
print("Missing Values Before Cleaning:")
print(df_subset.isnull().sum())
print(f"\nTotal missing: {df_subset.isnull().sum().sum()}")

# Create a clean copy
df_clean = df_subset.copy()


df_clean['emp_length'] = df_clean['emp_length'].fillna('Unknown')
df_clean['revol_util'] = df_clean['revol_util'].fillna(df_clean['revol_util'].median())

# Verify missing values are handled
print("\n" + "="*50)
print("Missing Values After Cleaning:")
print(df_clean.isnull().sum())
print(f"\nTotal missing: {df_clean.isnull().sum().sum()}")

Missing Values Before Cleaning:
loan_amnt           0
int_rate            0
grade               0
emp_length        895
annual_inc          0
dti                 0
fico_range_low      0
revol_util          7
purpose             0
home_ownership      0
loan_status         0
dtype: int64

Total missing: 902

Missing Values After Cleaning:
loan_amnt         0
int_rate          0
grade             0
emp_length        0
annual_inc        0
dti               0
fico_range_low    0
revol_util        0
purpose           0
home_ownership    0
loan_status       0
dtype: int64

Total missing: 0


In [5]:
# Remove extreme income outlier
df_clean = df_clean[df_clean['annual_inc'] < 4000000]

df_clean.value_counts

<bound method DataFrame.value_counts of        loan_amnt  int_rate grade emp_length  annual_inc    dti  \
0         3600.0     13.99     C  10+ years     55000.0   5.91   
1        24700.0     11.99     C  10+ years     65000.0  16.06   
2        20000.0     10.78     B  10+ years     63000.0  10.78   
3        35000.0     14.85     C  10+ years    110000.0  17.06   
4        10400.0     22.45     F    3 years    104433.0  25.37   
...          ...       ...   ...        ...         ...    ...   
14995     8000.0     12.59     C    7 years     79875.0   7.59   
14996    35000.0     17.86     D   < 1 year    160000.0  12.56   
14997     7275.0     11.22     B    5 years     30000.0  26.68   
14998    27000.0      9.76     B    7 years     57000.0  25.90   
14999    25000.0     11.99     C    5 years     90000.0  11.39   

       fico_range_low  revol_util             purpose home_ownership  \
0               675.0        29.7  debt_consolidation       MORTGAGE   
1               715.0  

In [None]:
# Replace DTI = 999 with NaN, then fill with median
df_clean.loc[df_clean['dti'] == 999, 'dti'] = np.nan
df_clean['dti'] = df_clean['dti'].fillna(df_clean['dti'].median())

#### Feature Engineering
##### Introducing the loan to income ratio and fico ranges for better prediction

In [None]:
# Create additional features
df_clean['loan_to_income'] = df_clean['loan_amnt'] / df_clean['annual_inc']
df_clean['fico_category'] = pd.cut(df_clean['fico_range_low'],
                                     bins=[0, 670, 740, 850],
                                     labels=['Fair', 'Good', 'Excellent'])

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Label encode ordinal features
le = LabelEncoder()
df_clean['grade_encoded'] = le.fit_transform(df_clean['grade'])

# One-hot encode nominal features (for modeling later)
df_encoded = pd.get_dummies(df_clean, columns=['purpose', 'home_ownership'], drop_first=True)

In [12]:
df_clean.to_csv('../raw_data/cleaned_loan_data.csv', index=False)

In [None]:

print("Cleaning up encoded dataset...")
print("="*60)

#  Create binary 'default' target if not already there
if 'default' not in df_encoded.columns:
    df_encoded['default'] = df_encoded['loan_status'].apply(
        lambda x: 1 if x == 'Charged Off' else 0
    )
    print(" Created binary 'default' column")

#  Encode emp_length
print("\nEncoding emp_length...")
df_encoded = pd.get_dummies(df_encoded, columns=['emp_length'], drop_first=True)
print(f" One-hot encoded emp_length")

#  Drop all the original categorical columns we don't need
columns_to_drop = [
    'grade',           # We have grade_encoded
    'loan_status',     # We have default (binary)
    'fico_category'    # We have fico_range_low (numeric)
]

print("\nDropping original categorical columns:")
for col in columns_to_drop:
    if col in df_encoded.columns:
        df_encoded = df_encoded.drop(col, axis=1)
        print(f"   Dropped: {col}")

#  Convert bool columns to int (cleaner for modeling)
bool_cols = df_encoded.select_dtypes(include=['bool']).columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)
print(f"\n Converted {len(bool_cols)} boolean columns to integers")

#  Verify everything is numeric
print("\nFinal Data Types:")
print(df_encoded.dtypes.value_counts())

# Check for remaining non-numeric columns
non_numeric = df_encoded.select_dtypes(include=['object', 'category']).columns
if len(non_numeric) == 0:
    print("\n SUCCESS! All columns are numeric!")
else:
    print(f"\n Still have non-numeric columns: {non_numeric.tolist()}")

#  Summary
print("\n" + "="*60)
print("FINAL DATASET FOR MODELING")
print("="*60)
print(f"Shape: {df_encoded.shape}")
print(f"Total columns: {df_encoded.shape[1]}")
print(f"Features: {df_encoded.shape[1] - 1}")
print(f"Target: 'default'")
print(f"\nColumn names:")
print(df_encoded.columns.tolist())


Cleaning up encoded dataset...
 Created binary 'default' column

Encoding emp_length...
 One-hot encoded emp_length

Dropping original categorical columns:
   Dropped: grade
   Dropped: loan_status
   Dropped: fico_category

 Converted 24 boolean columns to integers

Final Data Types:
int64      26
float64     7
Name: count, dtype: int64

 SUCCESS! All columns are numeric!

FINAL DATASET FOR MODELING
Shape: (15000, 33)
Total columns: 33
Features: 32
Target: 'default'

Column names:
['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'fico_range_low', 'revol_util', 'loan_to_income', 'grade_encoded', 'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase', 'purpose_medical', 'purpose_moving', 'purpose_other', 'purpose_renewable_energy', 'purpose_small_business', 'purpose_vacation', 'home_ownership_OWN', 'home_ownership_RENT', 'default', 'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years', '

In [16]:
output_path = '../data/processed/encoded_loan_data.csv'
df_encoded.to_csv(output_path, index=False)
print(f"\n SAVED: {output_path}")


 SAVED: ../data/processed/encoded_loan_data.csv
