In [207]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve, auc

In [208]:
data = pd.read_csv("./data/Model_Dataset.csv", index_col=0)
info = pd.read_excel("./data/Data Dictionary.xlsx")

In [209]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 373412 entries, 929649 to 1350743
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   acc_open_past_24mths    373412 non-null  float64
 1   avg_cur_bal             373412 non-null  float64
 2   bc_open_to_buy          367952 non-null  float64
 3   delinq_2yrs             373412 non-null  float64
 4   dti                     371540 non-null  float64
 5   earliest_cr_line        371519 non-null  object 
 6   emp_title               351384 non-null  object 
 7   grade                   373412 non-null  object 
 8   inq_last_6mths          371549 non-null  float64
 9   issue_d                 373412 non-null  object 
 10  loan_amnt               373412 non-null  int64  
 11  mths_since_last_delinq  193106 non-null  float64
 12  num_tl_120dpd_2m        355867 non-null  float64
 13  percent_bc_gt_75        369543 non-null  float64
 14  purpose            

In [210]:
X = data.drop(["target","id"],axis=1)
y = data.target

num_col = X.dtypes[X.dtypes != "object"].index.to_list()
date_col = ["earliest_cr_line", "issue_d", "term"]
cat_col = X.dtypes[(X.dtypes == "object") & (~X.dtypes.index.isin(date_col))].index.to_list()

X_train,X_valid,y_train,y_valid = train_test_split(X,y,train_size=0.75,random_state=0)

# missing value

def missing_value_handler(df):

    df["emp_title"] = np.where(df["emp_title"].isna(), "unemployed or info unavailable", "employed")
    idx = df.loc[(X["mths_since_last_delinq"].isna()) & (df["delinq_2yrs"] > 0)].index
    df.drop(idx, inplace=True)
    df["mths_since_last_delinq"] = np.where((df["mths_since_last_delinq"].isna()) & (df["delinq_2yrs"] == 0), 24, df["mths_since_last_delinq"])
    df["mths_since_last_delinq"] = np.where(df["num_tl_120dpd_2m"] > 0, 0, df["mths_since_last_delinq"])
    df["num_tl_120dpd_2m"] = np.where(df["num_tl_120dpd_2m"].isna(), 0, df["num_tl_120dpd_2m"])

    df.dropna(inplace=True)

    df["term"] = df["term"].str.split(" ").apply(lambda x : x[1]).astype(int)
    df["earliest_cr_line"] = pd.to_datetime(df["earliest_cr_line"], format="%b-%Y").dt.strftime('%Y%m%d').astype(int)
    df["issue_d"] = pd.to_datetime(df["issue_d"], format="%Y-%m-%d").dt.strftime('%Y%m%d').astype(int)
    
    return df

def pipeline(df,train=True):
    
    df = missing_value_handler(df)
    if train:
        df = pd.DataFrame(oed.fit_transform(df),index=df.index,columns=df.columns)
    
    else:    
        df = pd.DataFrame(oed.transform(df),index=df.index,columns=df.columns)
        
    return df

oed = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=-1)

X_train = pipeline(X_train)
X_train

Unnamed: 0,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,delinq_2yrs,dti,earliest_cr_line,emp_title,grade,inq_last_6mths,issue_d,...,mths_since_last_delinq,num_tl_120dpd_2m,percent_bc_gt_75,purpose,revol_util,term,total_acc,verification_status,gender,age
958948,3.0,6623.0,11168.0,1.0,1427.0,513.0,0.0,2.0,0.0,11.0,...,13.0,0.0,0.0,6.0,173.0,1.0,25.0,2.0,0.0,41.0
941019,7.0,3351.0,3648.0,0.0,2678.0,623.0,0.0,1.0,0.0,11.0,...,35.0,0.0,102.0,1.0,609.0,0.0,10.0,1.0,0.0,14.0
1078494,2.0,7354.0,477.0,1.0,2438.0,557.0,0.0,1.0,1.0,8.0,...,11.0,0.0,123.0,2.0,831.0,0.0,2.0,0.0,0.0,14.0
1331700,4.0,40768.0,5015.0,3.0,844.0,540.0,0.0,3.0,0.0,0.0,...,9.0,0.0,0.0,7.0,365.0,0.0,17.0,0.0,1.0,6.0
1088981,5.0,5279.0,11156.0,0.0,3124.0,388.0,0.0,1.0,1.0,8.0,...,72.0,0.0,0.0,2.0,383.0,0.0,35.0,1.0,1.0,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274334,11.0,10701.0,2042.0,0.0,2797.0,523.0,0.0,3.0,0.0,2.0,...,25.0,0.0,123.0,2.0,602.0,1.0,37.0,2.0,1.0,0.0
1333960,6.0,4288.0,1245.0,0.0,2642.0,481.0,0.0,2.0,1.0,0.0,...,36.0,0.0,134.0,2.0,659.0,0.0,29.0,1.0,1.0,26.0
1103917,1.0,6776.0,5164.0,0.0,2455.0,441.0,0.0,2.0,0.0,7.0,...,65.0,0.0,123.0,2.0,736.0,1.0,14.0,1.0,1.0,43.0
1064555,7.0,5413.0,3167.0,0.0,975.0,346.0,0.0,2.0,1.0,8.0,...,24.0,0.0,123.0,2.0,641.0,0.0,11.0,1.0,1.0,20.0


In [211]:
X_train, y_train = X_train.align(y_train,join="inner",axis=0)
X_valid = pipeline(X_valid,train=False)
X_valid,y_valid = X_valid.align(y_valid,join="inner",axis=0)

# first iteration
def model_fit(train_x,test_x,n=100):
    clf = RandomForestClassifier(random_state=0, n_estimators=n)
    clf.fit(train_x,y_train)
    y_pred = clf.predict(test_x)
    print(accuracy_score(y_valid,y_pred))
model_fit(X_train,X_valid)

0.8070762130889877


In [212]:
# transformation
X_train_trans = X_train[num_col + date_col] - X_train[num_col + date_col].mean() / X_train[num_col + date_col].std()
X_train_trans = X_train_trans.join(X_train[cat_col])
X_valid_trans = X_valid[num_col + date_col] - X_valid[num_col + date_col].mean() / X_valid[num_col + date_col].std()
X_valid_trans = X_valid_trans.join(X_valid[cat_col])

# feature selection
corr_table = X_train_trans[num_col].corr()
cor_loc = np.argwhere(abs(corr_table) > 0.7)
display(cor_loc[cor_loc[:,0] != cor_loc[:,1]])

VIF = np.linalg.inv(corr_table).diagonal()
display(VIF)

X_train_trans = X_train_trans.drop(X_train_trans.iloc[:,10].name,axis=1)
X_valid_trans = X_valid_trans.drop(X_valid_trans.iloc[:,10].name,axis=1)

# second iteration
model_fit(X_train_trans,X_valid_trans)

array([[ 9, 10],
       [10,  9]], dtype=int64)

array([1.45703077, 1.14221129, 1.77144078, 1.2403107 , 1.15397482,
       1.12121727, 1.24840353, 1.20125069, 1.00313614, 2.21397544,
       2.55390018, 1.47968412, 1.0000288 ])

0.806920152938947


In [213]:
# imbalanced sample boosting

"""
can use imbalanced-learn package to deal with imbalanced data

"""

# fine-tuning

"""
can use grid search together with cross-validation for more advanced hyperparam tuning

"""
for i in [100, 200, 500, 700]:
    model_fit(X_train_trans, X_valid_trans,i)

0.806920152938947
0.8086145202822459
0.8083023999821646
0.808581078821523
