In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [25]:
# Read the raw export and discard the LoanID column before any processing.
raw_csv_path = "../data/raw/Loan_default.csv"
loan_default_df = pd.read_csv(raw_csv_path).drop(columns=["LoanID"])

# Preview the first rows (rendered as the cell output).
loan_default_df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [26]:
# Work on an independent copy so the encoding steps never modify the loaded frame.
ld_transformed = loan_default_df.copy(deep=True)

# Ordinal Variables

In [27]:
# Each category is replaced by its 1-based position in the lists below.
education_levels = ["High School", "Bachelor's", "Master's", "PhD"]
employment_levels = ['Unemployed', 'Part-time', 'Self-employed', 'Full-time']

map_dict_education = {level: rank for rank, level in enumerate(education_levels, start=1)}
map_dict_employment = {level: rank for rank, level in enumerate(employment_levels, start=1)}

# Series.map yields NaN for any value that is not a key of the mapping.
for column, mapping in (
    ("Education", map_dict_education),
    ("EmploymentType", map_dict_employment),
):
    ld_transformed[column] = ld_transformed[column].map(mapping)

# One-Hot Encoding and dummies

In [28]:
# Nominal columns that are expanded into indicator columns (first category dropped).
ohe_variables = ["MaritalStatus", "LoanPurpose"]

ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False, dtype="int8")
ohe_array = ohe.fit_transform(loan_default_df[ohe_variables])
ohe_output_col_names = ohe.get_feature_names_out()

# Reuse the source frame's index: the column-wise concat below aligns rows by
# label, so a default 0..n-1 index would misalign (and create NaN rows) if the
# frame's index were ever anything else.
ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe_output_col_names,
    index=loan_default_df.index,
)

# Replace the original text columns with their encoded counterparts without
# mutating the frame in place.
ld_transformed = pd.concat(
    [ld_transformed.drop(columns=ohe_variables), ohe_df],
    axis=1,
)

In [29]:
# Yes/No style columns that are turned into a single 0/1 indicator each.
dummies_variables = ["HasMortgage", "HasDependents", "HasCoSigner"]

# drop_first must be the boolean True: the string "True" only worked because
# any non-empty string is truthy (so "False" would have behaved the same way).
dummies_df = pd.get_dummies(
    ld_transformed[dummies_variables],
    drop_first=True,
    dtype="int8",
)

# get_dummies names its outputs "<column>_<category>". Relabel them with the
# source column names so the assignment below matches columns by name rather
# than by position; set_axis raises if the number of indicator columns differs
# from the number of source columns.
ld_transformed[dummies_variables] = dummies_df.set_axis(dummies_variables, axis=1)

# Standardization

In [30]:
# Columns that are already encoded, plus the label, are excluded from scaling.
cat_var_and_label = [
    *ohe_output_col_names,
    *dummies_variables,
    "Default",
    "Education",
    "EmploymentType",
]
is_numeric_col = ~ld_transformed.columns.isin(cat_var_and_label)
num_cols = ld_transformed.columns[is_numeric_col]

# Fit the scaler on the remaining columns and overwrite them with the scaled values.
sc = StandardScaler()
standarized_output = sc.fit_transform(ld_transformed[num_cols])
ld_transformed[num_cols] = standarized_output

In [31]:
# Display the columns that were selected for scaling.
num_cols

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio'],
      dtype='object')

# Data Types

In [32]:
# Target dtype per column; only the label column is cast here.
data_types_dict = dict(Default="int8")

ld_transformed = ld_transformed.astype(dtype=data_types_dict)

# Saving

In [33]:
# Move the label to the front and keep every other column in its current order.
label_column = "Default"

all_cols = ld_transformed.columns.tolist()
all_cols.remove(label_column)

ld_transformed = ld_transformed[[label_column, *all_cols]]

In [34]:
# Persist the processed frame as parquet.
processed_path = "../data/processed/loan_default.parquet"
ld_transformed.to_parquet(processed_path)

# Approximation Processing code

In [4]:
import argparse
from glob import glob

import pandas as pd

from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    OrdinalEncoder
)
from sklearn.compose import ColumnTransformer


# Match on the ".csv" extension (not just any name that happens to end in
# "csv"), and sort so the row order of the combined frame does not depend on
# the directory listing order.
csv_files = sorted(glob("../data/raw/*.csv"))

if not csv_files:
    raise ValueError("Zero csv files were found. Check at least one CSV is present in the respective folder")

# One code path for one or many files. ignore_index gives the combined frame a
# fresh 0..n-1 index instead of repeating each file's own row labels.
df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Discard the LoanID column before any processing.
df = df.drop(columns="LoanID")


# Column groups consumed by the preprocessing pipeline.
# Ordinal columns map to their categories in encoding order; each list is
# wrapped in an outer list because it is passed as the `categories` argument
# of a single-column encoder.
education_levels = ["High School", "Bachelor's", "Master's", "PhD"]
employment_levels = ['Unemployed', 'Part-time', 'Self-employed', 'Full-time']
ordinal_var = {
    "Education": [education_levels],
    "EmploymentType": [employment_levels],
}

# Nominal columns are one-hot encoded.
nominal_var = [
    "MaritalStatus",
    "LoanPurpose",
    "HasMortgage",
    "HasDependents",
    "HasCoSigner",
]

# Numeric columns are standardized.
numeric_var = [
    'Age',
    'Income',
    'LoanAmount',
    'CreditScore',
    'MonthsEmployed',
    'NumCreditLines',
    'InterestRate',
    'LoanTerm',
    'DTIRatio',
]


# One (name, encoder, columns) entry per ordinal column. The step name is the
# first three letters of the column name, lower-cased. Values outside the
# listed categories are encoded as -1.
ordinal_pipeline = [
    (
        column[:3].lower(),
        OrdinalEncoder(
            categories=levels,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
            dtype="int16",
        ),
        [column],
    )
    for column, levels in ordinal_var.items()
]

# Encoder for the nominal columns.
# NOTE(review): per the scikit-learn docs, combining drop="first" with
# handle_unknown="ignore" encodes an unseen category as all zeros, the same
# pattern as the dropped first category — confirm this is acceptable.
one_hot_options = dict(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
    dtype="int8",
)
one_hot_encoder = OneHotEncoder(**one_hot_options)

# Scaler for the numeric columns.
standarization = StandardScaler()


# Full list of steps: ordinal encoders + one-hot encoder + standard scaler.
# A new list is built instead of aliasing ordinal_pipeline and extending it
# with +=, which would also append these two steps to ordinal_pipeline itself.
transformer_pipeline = [
    *ordinal_pipeline,
    ("ohe", one_hot_encoder, nominal_var),
    ("standarization", standarization, numeric_var),
]

# Columns not named in any step are passed through unchanged; output feature
# names are prefixed with the name of the step that produced them.
transformer = ColumnTransformer(
    transformers=transformer_pipeline,
    remainder='passthrough',
    verbose_feature_names_out=True
)

# Request pandas output so each step keeps its own dtype (int16 ordinal codes,
# int8 indicators, float scaled values). Stacking the steps into a single NumPy
# array would upcast every column to a common float dtype and discard the
# dtypes requested on the encoders.
transformer.set_output(transform="pandas")

# reset_index guarantees unique 0..n-1 row labels for the output frame, even
# if the input was concatenated from several files.
transformed_data = transformer.fit_transform(df.reset_index(drop=True))

# Move the passthrough label to the first column, stored compactly as int8.
y = transformed_data.pop("remainder__Default").astype("int8")
transformed_data = pd.concat([y, transformed_data], axis=1)

transformed_data.to_parquet("../data/processed/processed_data.parquet", index=False)