In [3]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.utils import  resample
import joblib

### Load and sample the data

In [4]:
df = pd.read_csv(r"../credit_risk_dataset/credit_risk_dataset.csv")

# Check the balance of the target data
target_ratio = df["loan_status"].value_counts()
print(f"Unbalanced target ratio: {target_ratio}\n")

def random_oversampling(df:pd.DataFrame, target:str):
    # Balance out the target variables by increasing the number of samples in the minority class
    
    df_majority = df[df[target] == 0]  # Majority class (low risk)
    df_minority = df[df[target] == 1]  # Minority class (high risk)

    # Upsample minority class
    df_minority_upsampled = resample(df_minority, 
        replace=True,    # Sample with replacement
        n_samples=len(df_majority),  # Match majority class size
        random_state=42)

    # Combine majority class with upsampled minority class
    df_balanced = pd.concat([df_majority, df_minority_upsampled])

    # Shuffle dataset
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Balanced target ratio: {df_balanced[target].value_counts()}")
    
    return df_balanced

def random_undersampling(df:pd.DataFrame, target:str):
    # Reduces the number of samples from the majority class
    
    df_majority = df[df[target] == 0]  # Majority class (low risk)
    df_minority = df[df[target] == 1]  # Minority class (high risk)
    
    # Downsample majority class
    df_majority_downsampled = resample(df_majority, 
        replace=False,    # Sample without replacement
        n_samples=len(df_minority),  # Match minority class size
        random_state=42)

    # Combine downsampled majority class with minority class
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

    # Shuffle dataset
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Balanced target ratio: {df_balanced[target].value_counts()}")
    
    return df_balanced

balanced_df = random_undersampling(df, "loan_status")

sample_size = 350  # Define sample size
df, _ = train_test_split(balanced_df, train_size=sample_size, stratify=balanced_df["loan_status"], random_state=42)  # Sample the data

Unbalanced target ratio: loan_status
0    25473
1     7108
Name: count, dtype: int64

Balanced target ratio: loan_status
0    7108
1    7108
Name: count, dtype: int64


### Clean Up Data

In [5]:
# "loan_int_rate", "person_emp_length" :  Cols with missing values

def replace_incorrect_vals(col_val:int):
    # Replaces incorrect employemnt length values with the mean of the column
    median_val = df["person_emp_length"].median()
    
    if col_val > 47:  # Max year assumes that employee started at 18 and retired at 64
        return median_val
    else:
        return col_val

df["person_emp_length"] = df["person_emp_length"].apply(replace_incorrect_vals)
df["person_emp_length"].fillna(df['person_emp_length'].median(), inplace=True)  # Fill the rest of the missing values with median
df["loan_int_rate"].fillna(round(df['loan_int_rate'].mean(), 2), inplace=True)  # Fill missing values with mean

# Check for duplicated rows
duplicate_rows = df.duplicated().sum()
print(f"Amount of duplicate rows: {duplicate_rows}\n\n")  # 0 duplicate rows

df["cb_person_default_on_file"] = df["cb_person_default_on_file"].replace({'N':0, 'Y':1})  # Replace N and Y with 0 and 1

# Check datatypes of the columns
print(df.dtypes)

# One hot encode the dataframe
df = pd.get_dummies(df, drop_first=False)

df.columns = df.columns.sort_values(ascending=True)  # Sort the columns in ascending order

# Save one hot encoded columns
ohe_features = df.drop("loan_status", axis=1).columns  # Get all One Hot Encoded Featuresd
joblib.dump(ohe_features, r"../CreditRiskApp/resources/ohe_cols.pkl")

df.head()

Amount of duplicate rows: 0


person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file       int64
cb_person_cred_hist_length      int64
dtype: object


Unnamed: 0,cb_person_cred_hist_length,cb_person_default_on_file,loan_amnt,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_int_rate,...,loan_intent_VENTURE,loan_percent_income,loan_status,person_age,person_emp_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,person_income
13610,26,33996,1.0,9600,9.88,0,0.28,0,2,False,...,False,True,False,False,False,True,False,False,False,False
4858,24,39000,2.0,16000,19.91,1,0.41,1,3,False,...,False,False,False,False,False,False,False,False,True,False
5628,23,92000,5.0,2400,12.69,0,0.03,0,2,True,...,False,False,False,False,False,True,False,False,False,False
1837,21,28000,3.0,8000,10.99,1,0.29,0,2,False,...,False,False,False,False,False,True,False,False,False,False
4636,33,56100,13.0,16000,6.03,0,0.29,0,8,False,...,True,False,False,False,True,False,False,False,False,False


### Save Features and Target values

In [6]:
# Define features (X) and target (y) columns
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

# Convert into csv and save features and target
X.to_csv("features.csv", index=False)
y.to_csv("target.csv", index=False)