# Future Loan Prediction

## Reminder: Why choose tree models

- scale invariant
- robust to multi-collinearity
- robust to outliers
- LightGBM additionally handles categorical variables natively.

In [83]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')


In [84]:
# Load the pre-built feature tables for the train and test splits.
train_path = "../data/processed/final_train_features.csv"
test_path = "../data/processed/final_test_features.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [85]:

# Hyper-parameter grid searched for the LightGBM classifier
# (3 * 2 * 3 * 2 = 36 candidate combinations).
params = {
    'min_child_weight': [1, 5, 10],
    'max_depth': [3, 4],
    'learning_rate': [0.001, 0.01, 0.1],
    'colsample_bytree': [0.1, 0.2],
}

# Base estimator that GridSearchCV clones for every candidate.
# `verbose_eval` was removed: it is not an LGBMClassifier argument, so it was
# only forwarded to the booster as an unknown parameter.
# `seed` and `nthread` are LightGBM aliases passed through **kwargs.
clf = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    seed=0,
    verbose=-1,
    metric='auc',
    nthread=16,
)
folds = 3

## always use stratified sampling
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Every column except the label is used as a model input.
# NOTE(review): this keeps identifier columns (loan_id, business_id, ...) and
# columns that look like post-outcome information (paid_late,
# total_recovered_on_time, ...) - confirm they are available at prediction time.
selected_features = train_df.drop(columns=["Target"]).columns
selected_features

Index(['loan_id', 'business_id', 'credit_officer_id', 'acquisition_channel',
       'sector', 'principal', 'total_owing_at_issue', 'application_number',
       'applying_for_loan_number', 'loan_number', 'employee_count',
       'approval_status', 'paid_late', 'total_recovered_on_time',
       'total_recovered_15_dpd', 'cash_yield_15_dpd',
       'repayment_duration_days'],
      dtype='object')

In [86]:
#ensure that objects are seen as categorical
def ensure_Object_to_category(df):
    '''
    Return a copy of ``df`` with every object-dtype column cast to category.

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data instead of an already-mutated frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should become categorical.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; columns that are
        not object dtype keep their dtype.
    '''
    object_columns = [col for col in df.columns if df[col].dtype.name == 'object']
    # astype with a column -> dtype mapping always returns a new frame, even
    # when the mapping is empty.
    return df.astype({col: 'category' for col in object_columns})


# Cast object columns to category so LightGBM treats them as categorical.
train_df = ensure_Object_to_category(train_df)
test_df = ensure_Object_to_category(test_df)

target = "Target"
selected_train_features = train_df.drop(columns=[target]).columns
# Still defined in case a later cell uses it; no longer used to build X_test.
selected_test_features = test_df.drop(columns=[target]).columns

# Select the test matrix with the *training* column list so both matrices have
# identical columns in identical order. A differently ordered test CSV would
# otherwise feed misaligned features to the model at predict time; a missing
# column now fails loudly with a KeyError.
X_train = train_df[selected_train_features]
X_test = test_df[selected_train_features]
y_train = train_df[target].tolist()
y_test = test_df[target].tolist()

# Pass the splitter itself rather than a one-shot generator from skf.split();
# GridSearchCV calls split() internally and the object can be reused on re-fit.
# NOTE(review): n_jobs=4 here multiplies with the classifier's own thread
# setting - confirm the machine has enough cores.
grid_search = GridSearchCV(
    clf,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
)

# Here we go
grid_search.fit(X_train, y_train)



Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [87]:
# Training frame restricted to the originally selected feature columns.
X_train1 = train_df.loc[:, selected_features]
# Show (rows, columns) of the test feature matrix.
X_test.shape

(2000, 17)

In [88]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC.
# NOTE(review): a perfect CV score usually points to target leakage in the
# features rather than a genuinely perfect model - verify.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(best_params, best_score)

{'colsample_bytree': 0.1, 'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1} 1.0


In [89]:
# Pair each input column with the importance reported by the refit best model,
# then display the table from most to least important.
feature_importance_df = pd.DataFrame(
    {
        "feature": selected_features,
        "importance": grid_search.best_estimator_.feature_importances_,
    }
)
feature_importance_df.sort_values("importance", ascending=False)

Unnamed: 0,feature,importance
16,repayment_duration_days,87
9,loan_number,76
1,business_id,69
14,total_recovered_15_dpd,68
5,principal,59
2,credit_officer_id,47
7,application_number,47
6,total_owing_at_issue,41
15,cash_yield_15_dpd,27
0,loan_id,22


In [90]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so best_estimator_ is used as-is.
# The earlier extra fit() call repeated that work, and the duplicate
# roc_auc_score import is dropped because the notebook's import cell has it.
classifier = grid_search.best_estimator_

# Column 1 of predict_proba is the probability of the positive class.
y_train_probs = classifier.predict_proba(X_train)[:, 1]
y_test_probs = classifier.predict_proba(X_test)[:, 1]

train_auc = roc_auc_score(y_train, y_train_probs)
test_auc = roc_auc_score(y_test, y_test_probs)
print(f"Model results. Train AUC : {train_auc}. Test AUC : {test_auc}")

Model results. Train AUC : 1.0. Test AUC : 0.963620751988431


#### _Observation_ 
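- The AUC value is very high.
- This does not by itself determine the final decision to approve or reject a loan, but it means that the model is able to differentiate defaults from non-defaults.
- Caution: a cross-validated and train AUC of 1.0 is suspicious and may indicate target leakage from post-outcome features (e.g. `paid_late`, `total_recovered_on_time`); verify that every feature is available at decision time before relying on this model.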


In [91]:
from src.utils import ks

# Put the true test labels next to the predicted positive-class probabilities
# and hand them to the project's `ks` helper, which returns the summary table.
test_result_df = pd.DataFrame(dict(target=y_test, proba=y_test_probs))
summary_table = ks(test_result_df, "target", "proba")
summary_table

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,total,bucket_event_rate,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.032724,0.049848,97,103,200,0.485,62.18%,5.59%,62.18%,5.59%,56.6
2,0.028749,0.03271,59,141,200,0.295,37.82%,7.65%,100.00%,13.23%,86.8
3,0.028639,0.028749,0,198,198,0.0,0.00%,10.74%,100.00%,23.97%,76.0
4,0.028552,0.028639,0,202,202,0.0,0.00%,10.95%,100.00%,34.92%,65.1
5,0.028385,0.028551,0,200,200,0.0,0.00%,10.85%,100.00%,45.77%,54.2
6,0.028195,0.028384,0,200,200,0.0,0.00%,10.85%,100.00%,56.62%,43.4
7,0.028076,0.028193,0,200,200,0.0,0.00%,10.85%,100.00%,67.46%,32.5
8,0.027932,0.028076,0,200,200,0.0,0.00%,10.85%,100.00%,78.31%,21.7
9,0.027811,0.027932,0,200,200,0.0,0.00%,10.85%,100.00%,89.15%,10.8
10,0.027517,0.027811,0,200,200,0.0,0.00%,10.85%,100.00%,100.00%,0.0


In [97]:
# Names of the training columns that currently have the categorical dtype.
cat = train_df.select_dtypes(include=["category"]).columns
cat

Index(['acquisition_channel', 'sector', 'approval_status'], dtype='object')