In [11]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.utils import  resample
import joblib

### Load and sample the data

In [12]:
# Load the raw credit-risk dataset (path is relative to the notebook's working directory)
df = pd.read_csv(r"../credit_risk_dataset/credit_risk_dataset.csv")

# Check the balance of the target data.
# Despite its name, target_ratio holds the row COUNT per loan_status value, not a proportion.
target_ratio = df["loan_status"].value_counts()
print(f"Unbalanced target ratio: {target_ratio}\n")

def random_oversampling(df:pd.DataFrame, target:str, random_state:int=42) -> pd.DataFrame:
    """Balance a binary 0/1 target by upsampling the smaller class with replacement.

    Rows whose ``target`` value is 0 and rows whose value is 1 are split into
    two groups. The smaller group is resampled with replacement until it has
    as many rows as the larger one, both groups are concatenated, and the
    result is shuffled. Rows with any other target value (including NaN) are
    not carried over into the result.

    Args:
        df: Input frame containing the ``target`` column. It is not modified.
        target: Name of the binary (0/1) label column.
        random_state: Seed used for both the resampling and the final shuffle.

    Returns:
        A new shuffled DataFrame with a fresh 0..n-1 index in which both
        classes have the same number of rows.
    """
    df_majority = df[df[target] == 0]  # Expected majority class (low risk)
    df_minority = df[df[target] == 1]  # Expected minority class (high risk)

    # Do not rely on label 0 being the larger class: if label 1 dominates,
    # "upsampling" it to the size of label 0 would actually shrink it.
    if len(df_minority) > len(df_majority):
        df_majority, df_minority = df_minority, df_majority

    # Upsample minority class
    df_minority_upsampled = resample(
        df_minority,
        replace=True,                # Sample with replacement
        n_samples=len(df_majority),  # Match majority class size
        random_state=random_state,
    )

    # Combine majority class with upsampled minority class
    df_balanced = pd.concat([df_majority, df_minority_upsampled])

    # Shuffle so the two classes are interleaved, then renumber the rows
    df_balanced = df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"Balanced target ratio: {df_balanced[target].value_counts()}")

    return df_balanced

def random_undersampling(df:pd.DataFrame, target:str, random_state:int=42) -> pd.DataFrame:
    """Balance a binary 0/1 target by downsampling the larger class without replacement.

    Rows whose ``target`` value is 0 and rows whose value is 1 are split into
    two groups. The larger group is randomly reduced (no row is drawn twice)
    to the size of the smaller one, both groups are concatenated, and the
    result is shuffled. Rows with any other target value (including NaN) are
    not carried over into the result.

    Args:
        df: Input frame containing the ``target`` column. It is not modified.
        target: Name of the binary (0/1) label column.
        random_state: Seed used for both the resampling and the final shuffle.

    Returns:
        A new shuffled DataFrame with a fresh 0..n-1 index in which both
        classes have the same number of rows.
    """
    df_majority = df[df[target] == 0]  # Expected majority class (low risk)
    df_minority = df[df[target] == 1]  # Expected minority class (high risk)

    # Do not rely on label 0 being the larger class: sampling without
    # replacement for more rows than the group holds raises ValueError.
    if len(df_minority) > len(df_majority):
        df_majority, df_minority = df_minority, df_majority

    # Downsample majority class
    df_majority_downsampled = resample(
        df_majority,
        replace=False,               # Sample without replacement
        n_samples=len(df_minority),  # Match minority class size
        random_state=random_state,
    )

    # Combine downsampled majority class with minority class
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

    # Shuffle so the two classes are interleaved, then renumber the rows
    df_balanced = df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"Balanced target ratio: {df_balanced[target].value_counts()}")

    return df_balanced

# Balance the classes by downsampling the majority class to the size of the minority class;
# balanced_df is the working frame used by the cells below.
balanced_df = random_undersampling(df, "loan_status")

# Disabled: optional stratified subsample of the balanced data down to a fixed number of rows
# sample_size = 10000  # Define sample size
# balanced_df, _ = train_test_split(balanced_df, train_size=sample_size, stratify=balanced_df["loan_status"], random_state=42)  # Sample the data

Unbalanced target ratio: loan_status
0    25473
1     7108
Name: count, dtype: int64

Balanced target ratio: loan_status
0    7108
1    7108
Name: count, dtype: int64


### Clean Up Data

In [None]:
# "loan_int_rate", "person_emp_length" :  Cols with missing values

def replace_incorrect_vals(col_val:float, max_emp_length:float=47, median_val:float=None):
    """Replace an implausibly long employment length with the column median.

    Args:
        col_val: A single ``person_emp_length`` value; may be NaN.
        max_emp_length: Largest value still accepted as valid. The default of
            47 assumes that the employee started at 18 and retired at 64.
        median_val: Replacement for out-of-range values. When omitted, the
            median of ``df["person_emp_length"]`` (the notebook-level raw
            frame) is used.

    Returns:
        ``median_val`` when ``col_val`` is greater than ``max_emp_length``,
        otherwise ``col_val`` unchanged. NaN never compares as greater, so
        missing values pass through untouched.
    """
    if col_val > max_emp_length:
        # Look the median up only when a replacement is actually needed.
        # Computing it up front would rescan the whole column for every
        # element passed through Series.apply.
        if median_val is None:
            median_val = df["person_emp_length"].median()
        return median_val
    return col_val
    

# Swap implausible employment lengths for the median, then fill the remaining
# gaps with the median of the raw data. The result is assigned back to the
# column: calling fillna(inplace=True) on a selected column is chained
# in-place mutation, which acts on a temporary under pandas Copy-on-Write and
# would leave the NaNs in balanced_df.
balanced_df["person_emp_length"] = (
    balanced_df["person_emp_length"]
    .apply(replace_incorrect_vals)
    .fillna(df["person_emp_length"].median())
)

# Fill missing interest rates with the median rate of the row's loan_status group.
# NOTE(review): loan_status is the prediction target, so this imputation uses the
# label and cannot be reproduced for unlabeled rows at prediction time. Confirm it
# is intended; a label-independent alternative is the overall column median.
balanced_df["loan_int_rate"] = balanced_df.groupby("loan_status")["loan_int_rate"].transform(lambda x: x.fillna(x.median()))

# Count fully duplicated rows in the balanced frame
duplicate_rows = balanced_df.duplicated().sum()
print(f"Amount of duplicate rows: {duplicate_rows}\n\n")  # Duplicates are only reported here, not dropped

balanced_df["cb_person_default_on_file"] = balanced_df["cb_person_default_on_file"].replace({'N':"No", 'Y':"Yes"})  # Spell out N and Y as "No" and "Yes"

# Check datatypes of the columns
print(balanced_df.dtypes)

# Save the feature column names as they are BEFORE one hot encoding
cols = balanced_df.drop("loan_status", axis=1).columns  # All columns except the target
joblib.dump(cols, r"../CreditRiskApp/resources/cols.pkl")

# One hot encode the dataframe (drop_first=False keeps one indicator column per category)
balanced_df = pd.get_dummies(balanced_df, drop_first=False)

# Save the feature column names AFTER one hot encoding
ohe_features = balanced_df.drop("loan_status", axis=1).columns  # All encoded columns except the target
joblib.dump(ohe_features, r"../CreditRiskApp/resources/ohe_cols.pkl")

# NOTE(review): this previews the raw frame `df`, not the cleaned and encoded balanced_df - confirm that is intended
df.head()

Amount of duplicate rows: 34


person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [14]:
# Correlation of every feature column with the target; the target's own
# entry is removed before the values are printed
balanced_df_corr = (
    balanced_df.corr()
    .loc[:, "loan_status"]
    .drop(labels="loan_status")
)

print(balanced_df_corr)

person_age                       -0.023170
person_income                    -0.164663
person_emp_length                -0.102696
loan_amnt                         0.119423
loan_int_rate                     0.405588
loan_percent_income               0.401856
cb_person_cred_hist_length       -0.016554
person_home_ownership_MORTGAGE   -0.234007
person_home_ownership_OTHER       0.010018
person_home_ownership_OWN        -0.139859
person_home_ownership_RENT        0.292931
loan_intent_DEBTCONSOLIDATION     0.085718
loan_intent_EDUCATION            -0.078831
loan_intent_HOMEIMPROVEMENT       0.044475
loan_intent_MEDICAL               0.073138
loan_intent_PERSONAL             -0.022111
loan_intent_VENTURE              -0.102802
loan_grade_A                     -0.260571
loan_grade_B                     -0.117049
loan_grade_C                     -0.010880
loan_grade_D                      0.313622
loan_grade_E                      0.170925
loan_grade_F                      0.090193
loan_grade_

### Save Features and Target values

In [15]:
# Split the frame into the target column (y) and the remaining feature columns (X)
y = balanced_df["loan_status"]
X = balanced_df.drop(columns=["loan_status"])

# Write both to CSV in the working directory, leaving out the row index
X.to_csv("features.csv", index=False)
y.to_csv("target.csv", index=False)