In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw credit-risk records from the CSV (path is relative to the working directory).
source_csv = 'credit_risk_dataset.csv'
data = pd.read_csv(source_csv)

# Preview the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# Keep only the rows whose loan_intent equals 'PERSONAL'; `data` is rebound to that subset.
is_personal_loan = data['loan_intent'] == 'PERSONAL'
data = data[is_personal_loan]

In [4]:
# Report how many records the current frame holds.
personal_loan_rows = len(data)
print(f"Number of Rows After Filtering to Personal Loans: {personal_loan_rows}")

Number of Rows After Filtering to Personal Loans: 5521


In [5]:
# List the column labels, followed by how many there are.
print("Column Names:")
column_names = list(data.columns)
print(column_names)
print(f"Number of Columns: {len(column_names)}")

Column Names:
['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
Number of Columns: 12


In [6]:
# Display dataset information (index range, per-column non-null counts, dtypes, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
data.info()



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 5521 entries, 0 to 32579
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  5521 non-null   int64  
 1   person_income               5521 non-null   int64  
 2   person_home_ownership       5521 non-null   object 
 3   person_emp_length           5369 non-null   float64
 4   loan_intent                 5521 non-null   object 
 5   loan_grade                  5521 non-null   object 
 6   loan_amnt                   5521 non-null   int64  
 7   loan_int_rate               5014 non-null   float64
 8   loan_status                 5521 non-null   int64  
 9   loan_percent_income         5521 non-null   float64
 10  cb_person_default_on_file   5521 non-null   object 
 11  cb_person_cred_hist_length  5521 non-null   int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 560.7+ KB
None


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print("\nSummary Statistics for Numerical Columns:")
numeric_summary = data.describe()
print(numeric_summary)


Summary Statistics for Numerical Columns:
        person_age  person_income  person_emp_length     loan_amnt  \
count  5521.000000   5.521000e+03        5369.000000   5521.000000   
mean     28.208477   6.786414e+04           4.888061   9573.772867   
std       7.263431   9.809568e+04           4.375188   6298.425223   
min      20.000000   4.200000e+03           0.000000    500.000000   
25%      23.000000   3.900000e+04           2.000000   5000.000000   
50%      26.000000   5.500000e+04           4.000000   8000.000000   
75%      30.000000   8.000000e+04           7.000000  12000.000000   
max     144.000000   6.000000e+06         123.000000  35000.000000   

       loan_int_rate  loan_status  loan_percent_income  \
count    5014.000000  5521.000000           5521.00000   
mean       10.998221     0.198877              0.16923   
std         3.228400     0.399191              0.10540   
min         5.420000     0.000000              0.00000   
25%         7.900000     0.000000   

In [8]:
# Handle outliers by clipping implausibly large values to fixed upper bounds.
# Values at or below the bound are unchanged, and missing values stay missing.
emp_length_cap = 50  # upper bound for person_emp_length (years)
age_cap = 90         # upper bound for person_age (years)

# NOTE(review): clipping keeps the affected rows, so a record can still pair a
# young age with the capped employment length (e.g. age 22 with 50 years) --
# confirm that this is acceptable for the downstream analysis.
upper_caps = {
    'person_emp_length': emp_length_cap,
    'person_age': age_cap,
}
for capped_col, upper_cap in upper_caps.items():
    data[capped_col] = data[capped_col].clip(upper=upper_cap)

# Re-print the summary so the effect of the caps on the maxima is visible.
print("\nSummary Statistics for Numerical Columns (After Outlier Handling):")
capped_summary = data.describe()
print(capped_summary)


Summary Statistics for Numerical Columns (After Outlier Handling):
        person_age  person_income  person_emp_length     loan_amnt  \
count  5521.000000   5.521000e+03        5369.000000   5521.000000   
mean     28.198696   6.786414e+04           4.874465   9573.772867   
std       7.142837   9.809568e+04           4.113683   6298.425223   
min      20.000000   4.200000e+03           0.000000    500.000000   
25%      23.000000   3.900000e+04           2.000000   5000.000000   
50%      26.000000   5.500000e+04           4.000000   8000.000000   
75%      30.000000   8.000000e+04           7.000000  12000.000000   
max      90.000000   6.000000e+06          50.000000  35000.000000   

       loan_int_rate  loan_status  loan_percent_income  \
count    5014.000000  5521.000000           5521.00000   
mean       10.998221     0.198877              0.16923   
std         3.228400     0.399191              0.10540   
min         5.420000     0.000000              0.00000   
25%        

In [9]:
# Count the missing entries in every column.
print("\nMissing Values:")
missing_values = data.isna().sum()
print(missing_values)


Missing Values:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length             152
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 507
loan_status                     0
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64


In [10]:
# Handle missing values
# Rows lacking a value in either of these columns are dropped (not imputed),
# so the dataset shrinks instead of receiving filled-in values.
numerical_cols = ['person_emp_length', 'loan_int_rate']
data = data.dropna(subset=numerical_cols)

# Show the per-column missing counts that remain. The heading says "Dropping
# Rows" because this cell performs no imputation.
print("\nMissing Values After Dropping Rows:")
print(data.isnull().sum())


Missing Values After Imput:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


In [None]:
# Inspect the distinct levels of each categorical column ahead of one-hot encoding.
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
print("\nUnique Values in Categorical Columns:")
for col in categorical_cols:
    column_values = data[col]
    print(f"{col}: {column_values.nunique()} unique values - {column_values.unique()}")


Unique Values in Categorical Columns:
person_home_ownership: 4 unique values - ['RENT' 'OWN' 'MORTGAGE' 'OTHER']
loan_intent: 1 unique values - ['PERSONAL']
loan_grade: 7 unique values - ['D' 'A' 'E' 'C' 'B' 'F' 'G']
cb_person_default_on_file: 2 unique values - ['Y' 'N']


In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so a column holding a single level (loan_intent, which only
# contains 'PERSONAL' at this point) contributes no dummy column at all.
data = pd.get_dummies(
    data,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# List the column labels produced by the one-hot encoding, followed by their count.
print("\nColumn Names After Get Dummies:")
encoded_column_names = list(data.columns)
print(encoded_column_names)
print(f"Number of Columns After Get Dummies: {len(encoded_column_names)}")


Column Names After Get Dummies:
['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_cred_hist_length', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_Y']
Number of Columns After Get Dummies: 18


In [14]:
# Explore correlations (for Visualization Lead)
# Compute the pairwise correlation matrix, then list every column's correlation
# with loan_status from the largest value down to the smallest.
print("\nCorrelation Matrix (Top Correlations with loan_status):")
correlation_matrix = data.corr()
loan_status_correlations = correlation_matrix['loan_status'].sort_values(ascending=False)
print(loan_status_correlations)


Correlation Matrix (Top Correlations with loan_status):
loan_status                    1.000000
loan_percent_income            0.392484
person_home_ownership_RENT     0.282006
loan_int_rate                  0.253041
loan_grade_D                   0.227678
cb_person_default_on_file_Y    0.133687
loan_grade_E                   0.120384
loan_grade_G                   0.091383
loan_amnt                      0.082445
loan_grade_F                   0.060177
person_home_ownership_OTHER    0.043603
loan_grade_C                  -0.018435
loan_grade_B                  -0.033682
cb_person_cred_hist_length    -0.039190
person_age                    -0.056335
person_income                 -0.100842
person_home_ownership_OWN     -0.106008
person_emp_length             -0.132836
Name: loan_status, dtype: float64


In [None]:
# Preview the first five rows of the processed frame as a visual cleanliness check.
data.head(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,22,59000,50.0,35000,16.02,1,0.59,3,False,False,True,False,False,True,False,False,False,True
8,24,83000,8.0,35000,8.9,1,0.42,2,False,False,True,False,False,False,False,False,False,False
21,25,137000,9.0,34800,16.77,0,0.25,2,False,False,True,False,False,False,True,False,False,True
23,24,10980,0.0,1500,7.29,0,0.14,3,False,True,False,False,False,False,False,False,False,False
24,22,80000,3.0,33950,14.54,1,0.42,4,False,False,True,False,False,True,False,False,False,True


In [None]:
# Display dataset information (index range, per-column non-null counts, dtypes, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 4877 entries, 0 to 32579
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   person_age                   4877 non-null   int64  
 1   person_income                4877 non-null   int64  
 2   person_emp_length            4877 non-null   float64
 3   loan_amnt                    4877 non-null   int64  
 4   loan_int_rate                4877 non-null   float64
 5   loan_status                  4877 non-null   int64  
 6   loan_percent_income          4877 non-null   float64
 7   cb_person_cred_hist_length   4877 non-null   int64  
 8   person_home_ownership_OTHER  4877 non-null   bool   
 9   person_home_ownership_OWN    4877 non-null   bool   
 10  person_home_ownership_RENT   4877 non-null   bool   
 11  loan_grade_B                 4877 non-null   bool   
 12  loan_grade_C                 4877 non-null   bool   
 13  loan_gr

In [18]:
# Persist the preprocessed frame for later use; index=False leaves the row index out of the file.
output_csv = 'preprocessed_credit_risk.csv'
data.to_csv(output_csv, index=False)