In [1]:
import pandas as pd

import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# Read the loan-default CSV into memory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/Loan_Default.csv')
df.head(n=5)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [3]:
# Column groups used by the preprocessing cells below. Each group gets its own
# imputation / encoding treatment.

# Continuous columns: imputed, then standardized.
NUMERICAL_VARIBLES = [
    'loan_amount',
    'rate_of_interest',
    'Upfront_charges',
    'property_value',
    'income',
    'LTV',
    'dtir1',
]

# Columns that are label-encoded. 'age' is listed here so it is imputed with
# the others, but it is skipped by the label-encoding cells and mapped through
# an explicit lookup table instead.
BINARY_VARIBLES = [
    'loan_limit',
    'approv_in_adv',
    'Credit_Worthiness',
    'open_credit',
    'business_or_commercial',
    'Neg_ammortization',
    'interest_only',
    'lump_sum_payment',
    'construction_type',
    'Secured_by',
    'co-applicant_credit_type',
    'age',
    'Security_Type',
]

# Columns that are one-hot encoded.
CATEGORICAL_VARIBLES = [
    'Gender',
    'loan_type',
    'loan_purpose',
    'term',
    'occupancy_type',
    'total_units',
    'credit_type',
    'Region',
]

# Every input column, in group order: numerical, binary, categorical.
FEATURES = [*NUMERICAL_VARIBLES, *BINARY_VARIBLES, *CATEGORICAL_VARIBLES]

In [4]:
# Count missing values per column of the loaded frame.
df.isnull().sum()

ID                               0
year                             0
loan_limit                    3442
Gender                           0
approv_in_adv                  935
loan_type                        0
loan_purpose                   135
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             37333
Interest_rate_spread         37538
Upfront_charges              40623
term                            43
Neg_ammortization              122
interest_only                    0
lump_sum_payment                 0
property_value               15485
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9358
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            205
submission_of_applic

In [5]:
# Work on an independent copy: the cells below overwrite columns of X, and a
# plain `X = df` alias would silently rewrite the raw frame as well.
X = df.copy()

In [6]:
transformers = {}
for num_col in NUMERICAL_VARIBLES:
    group_means = X.groupby('Status')[num_col].transform('mean')
    X[num_col] = X[num_col].fillna(group_means)
    std_scaler = StandardScaler()
    std_scaler.fit(X[num_col].values.reshape(-1,1))
    transformers[num_col] = std_scaler

In [7]:
# Replace every numerical column with its standardized values.
# Plain column assignment is used instead of `X.loc[:, col] = ...`: the `.loc`
# form writes into the existing column in place, and storing floats into an
# integer-typed column that way is deprecated upcasting in recent pandas.
for num_col in NUMERICAL_VARIBLES:
    scaled = transformers[num_col].transform(X[num_col].to_numpy().reshape(-1, 1))
    # transform() returns shape (n_samples, 1); flatten it to a single column.
    X[num_col] = scaled.ravel()

In [8]:
# Impute each categorical column with its most frequent value, then fit one
# OneHotEncoder per column and store it in `transformers` under the column name.
for cat_col in CATEGORICAL_VARIBLES:
    most_frequent = X[cat_col].mode()[0]
    X[cat_col] = X[cat_col].fillna(most_frequent)
    # handle_unknown='ignore': categories unseen at fit time do not raise on transform.
    transformers[cat_col] = OneHotEncoder(handle_unknown='ignore').fit(
        X[cat_col].to_numpy().reshape(-1, 1)
    )

In [9]:
# One-hot encode every categorical column and swap the encoded columns in for
# the originals. The encoded frames are built first and attached with a single
# concat, rather than re-indexing and re-concatenating the whole frame once
# per column. Resulting column order is unchanged: remaining columns first,
# then each column's indicator columns in CATEGORICAL_VARIBLES order.
encoded_frames = []
for cat_col in CATEGORICAL_VARIBLES:
    encoder = transformers[cat_col]
    # transform() yields a sparse matrix; densify it to build a DataFrame.
    encoded_features = encoder.transform(X[cat_col].to_numpy().reshape(-1, 1)).toarray()
    encoded_frames.append(
        pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out([cat_col]))
    )

# Reset the index so rows line up positionally with the freshly built
# (0..n-1 indexed) encoded frames.
X = pd.concat(
    [X.reset_index(drop=True).drop(columns=CATEGORICAL_VARIBLES), *encoded_frames],
    axis=1,
)

In [10]:
# Impute every column in BINARY_VARIBLES with its most frequent value and fit
# a LabelEncoder for each one except 'age', which gets no encoder here.
for bin_col in BINARY_VARIBLES:
    most_frequent = X[bin_col].mode()[0]
    X[bin_col] = X[bin_col].fillna(most_frequent)
    if bin_col == 'age':
        continue
    transformers[bin_col] = LabelEncoder().fit(X[bin_col])

In [11]:
# Replace each label-encoded column with its integer codes; 'age' has no
# fitted LabelEncoder and is skipped.
for bin_col in BINARY_VARIBLES:
    if bin_col == 'age':
        continue
    fitted_encoder = transformers[bin_col]
    X[bin_col] = fitted_encoder.transform(X[bin_col])
        

In [12]:
# Ordinal code for each age bucket label, youngest (1) to oldest (7).
age_range_dict = {
    "<25": 1,
    "25-34": 2,
    "35-44": 3,
    "45-54": 4,
    "55-64": 5,
    "65-74": 6,
    ">74": 7
}

# Map bucket labels to their codes. An explicit element-wise lookup is used
# instead of `Series.replace(dict)`: replace relies on implicit object -> int
# downcasting, which is deprecated in recent pandas and would otherwise leave
# the column as object dtype. Values not present in the table are passed
# through unchanged (same as replace), so re-running the cell is harmless.
X['age'] = X['age'].map(lambda bucket: age_range_dict.get(bucket, bucket))

In [13]:
# Count remaining missing values per column after preprocessing.
X.isnull().sum()

ID                   0
year                 0
loan_limit           0
approv_in_adv        0
Credit_Worthiness    0
                    ..
credit_type_EXP      0
Region_North         0
Region_North-East    0
Region_central       0
Region_south         0
Length: 78, dtype: int64

In [15]:
# Persist the fitted transformers dict. The context manager guarantees the
# file is flushed and closed even if pickling raises; a bare
# `open(...)` passed inline is never closed explicitly.
with open('artifacts/transformer.pkl', 'wb') as transformer_file:
    pickle.dump(transformers, transformer_file)

In [None]:
# Display the transformer stored under the 'Gender' key (inspection only).
transformers['Gender']