In [1]:
import pandas as pd
import warnings
import pickle
import numpy as np
import statsmodels.api as sm
import re
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# show up to 100 columns when a wide frame is displayed
pd.options.display.max_columns = 100

### Load Models & Objects

In [2]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file handle afterwards.

    NOTE(review): unpickling can execute arbitrary code, so these artefacts
    must come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load one hot encoder for pd model
pd_enc = _load_pickle("PD_One_hot.pkl")

# Load one hot encoder for LGD EAD Model
lgd_ead_enc = _load_pickle("LGD_EDA_One_hot.pkl")

# Load min max scaler for LGD EAD Model
lgd_ead_scaler = _load_pickle("LGD_EDA_Scaler.pkl")

# Load PD Model
pd_model = _load_pickle("PD_Model.pkl")

# Load LGD Model
lgd_model = _load_pickle("LGD_Model.pkl")

# Load EAD Model
ead_model = _load_pickle("EAD_Model.pkl")

### Preprocess routine for PD Model

In [3]:
# Preprocess routine for PD Model

# function for converting dti to category
def get_dti_cat(dti):
    """Map a numeric dti value onto its bucket label.

    Buckets are half-open: ``[lower, upper)``. Anything that is not below
    32 (including NaN, for which every comparison is false) falls through
    to ``'above_32'``.
    """
    upper_bounds = (
        (8, 'below_8'),
        (16, 'bet_8_16'),
        (24, 'bet_16_24'),
        (32, 'bet_24_32'),
    )
    # first bucket whose upper bound exceeds the value wins
    for bound, label in upper_bounds:
        if dti < bound:
            return label
    return 'above_32'

# function for converting loan amount to category
def get_loan_cat(amount):
    """Map a loan amount onto its bucket label.

    Buckets are half-open: ``[lower, upper)``. Anything that is not below
    28100 (including NaN, for which every comparison is false) falls
    through to ``'above_281'``.
    """
    upper_bounds = (
        (7400, 'below_74'),
        (14300, 'bet_74_143'),
        (21200, 'bet_143_212'),
        (28100, 'bet_212_281'),
    )
    # first bucket whose upper bound exceeds the amount wins
    for bound, label in upper_bounds:
        if amount < bound:
            return label
    return 'above_281'

def process_raw(raw,enc):
    """Build the one-hot feature frame used by the PD model.

    Parameters
    ----------
    raw : pandas.DataFrame
        Loan records. Must contain ``id``, ``member_id``, ``grade``,
        ``verification_status``, ``loan_amnt``, ``term``, ``dti`` and
        ``purpose``. ``term`` is expected to be a string whose first
        whitespace-separated token is the number of months. ``raw`` itself
        is not modified.
    enc : fitted encoder
        Object exposing ``transform`` (returning something with
        ``toarray()``) and ``get_feature_names()`` whose names carry the
        generic ``x0_`` ... ``x5_`` prefixes. Presumably the scikit-learn
        one-hot encoder fitted for the PD model - confirm against training.

    Returns
    -------
    pandas.DataFrame
        ``id`` and ``member_id`` followed by the renamed one-hot columns,
        indexed like ``raw``.
    """
    # select feature
    imp_feature = ['id','member_id','grade','verification_status','loan_amnt','term','dti','purpose']
    df = raw[imp_feature].copy()

    # Coarse Classing Purpose Variable
    purpose_coarse_classing = {'moving': 'cat_1','other':'cat_1','renewable_energy':'cat_1',
                               'house':'cat_2','vacation':'cat_2','educational':'cat_2',
                               'credit_card':'cat_3','home_improvement':'cat_3',
                               'major_purchase':'cat_4','wedding':'cat_4','car':'cat_4',
                               'medical':'cat_5','debt_consolidation':'cat_5'}

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form operates on a temporary under pandas
    # copy-on-write and would silently leave ``df`` unchanged.
    df['purpose'] = df['purpose'].replace(purpose_coarse_classing)

    # Discretize loan amount and dti into their bucket labels
    df['loan_amnt'] = df['loan_amnt'].apply(get_loan_cat).astype(str)
    df['dti'] = df['dti'].apply(get_dti_cat).astype(str)

    # converting term from str (e.g. "36 months") to int
    df['term'] = df['term'].str.split().str[0].astype(int)

    # one hot category columns
    cat_columns = ['grade','verification_status','loan_amnt','term','dti','purpose']
    one_hot_vec = enc.transform(df[cat_columns]).toarray()
    one_hot_df = pd.DataFrame(one_hot_vec,columns=enc.get_feature_names(),index=df.index)

    # converting the encoder's positional prefixes to meaningful names;
    # plain-text (non-regex) replacement, applied in encoder column order
    prefix_labels = {'x0_': 'Grade:', 'x1_': 'Status:', 'x2_': 'Loan:',
                     'x3_': 'Term:', 'x4_': 'DTI:', 'x5_': 'Purpose:'}
    for generic, label in prefix_labels.items():
        one_hot_df.columns = one_hot_df.columns.str.replace(generic, label, regex=False)
    one_hot_df.columns = one_hot_df.columns.str.replace(' ', '_', regex=False)

    # drop the raw category columns and attach their one-hot replacement
    df = df.drop(columns=cat_columns)
    return pd.concat([df,one_hot_df],axis=1,sort=False)

In [4]:
# function for computing credit score
def get_credit_score(df,score_card_path='score.csv'):
    """Score every row of ``df`` against the scorecard stored as CSV.

    The scorecard is read with its first column as the index; those index
    labels name the columns of ``df`` that take part in the score. An
    ``Intercept`` column of ones is added to a copy of ``df`` (the caller's
    frame is left untouched) and the selected columns are multiplied with
    the scorecard via a dot product.

    Returns the dot product, i.e. a frame with one column per scorecard
    column, indexed like ``df``.
    """
    scorecard = pd.read_csv(score_card_path, index_col=0)

    # work on a new frame that carries the constant term
    with_intercept = df.assign(Intercept=1)

    return with_intercept[scorecard.index].dot(scorecard)

### Routine for processing LGD EAD Models

In [5]:
# Routine for processing LGD EAD Models
def prepare_data(df,enc,scaler):
    """Build the scaled / one-hot feature frame used by the LGD and EAD models.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan records. Must contain ``id``, ``member_id``, ``term``,
        ``emp_length``, ``earliest_cr_line`` and ``issue_d`` (both formatted
        like ``"Dec-15"``), ``total_pymnt``, ``total_rec_prncp``,
        ``installment``, ``annual_inc``, ``funded_amnt`` and every column
        named in ``numeric_col`` / ``category_col`` below. ``df`` itself is
        not modified.
    enc : fitted encoder
        Object exposing ``transform`` (returning something with
        ``toarray()``) and ``get_feature_names()`` whose names carry the
        generic ``x0_`` ... ``x4_`` prefixes.
    scaler : fitted scaler
        Object exposing ``transform`` for the numeric columns.

    Returns
    -------
    pandas.DataFrame
        ``id``, ``member_id``, the scaled numeric features and the renamed
        one-hot columns, indexed like ``df``.
    """
    clean_df = df.copy()

    # cleaning term variable: "36 months" -> 36
    clean_df['term'] = clean_df['term'].str.split().str[0].astype(int)

    # Missing and "< 1 year" employment lengths both count as 0 years.
    # The result is assigned back rather than using replace(inplace=True) on
    # the column selection, which is a silent no-op under pandas copy-on-write.
    emp_length = clean_df['emp_length'].fillna('0 year').replace({'< 1 year': '0 year'})

    # extracting the employment years (first run of digits) and converting to int
    clean_df['emp_length'] = emp_length.str.extract(r'(\d+)', expand=False).astype(int)

    # converting both date columns to pandas datetime format
    issue_d = pd.to_datetime(clean_df['issue_d'],format='%b-%y')
    earliest = pd.to_datetime(clean_df['earliest_cr_line'],format='%b-%y')

    # %y maps two-digit years 00-68 to 20xx, so old credit lines can land in
    # the future. A credit line cannot be opened after the loan was issued, so
    # any such date is moved back one century. Comparing with the loan's own
    # issue date (instead of a fixed cut-off year) keeps this correct for
    # whatever vintage is being scored.
    wrong_century = earliest > issue_d
    earliest = earliest.mask(wrong_century, earliest - pd.DateOffset(years=100))

    # filling missing earliest credit line with the issue date
    earliest = earliest.fillna(issue_d)

    clean_df['issue_d'] = issue_d
    clean_df['earliest_cr_line'] = earliest

    # new feature: credit history length at loan issue, in (30-day) months
    clean_df['credit_hist'] = ((issue_d - earliest).dt.days // 30).astype(int)

    # payments, principal received, installment and annual income,
    # each expressed as a ratio of the funded amount
    clean_df['per_pymnt'] = clean_df['total_pymnt']/clean_df['funded_amnt']
    clean_df['per_rec_prncp'] = clean_df['total_rec_prncp']/clean_df['funded_amnt']
    clean_df['per_installment'] = clean_df['installment']/clean_df['funded_amnt']
    clean_df['per_annual_inc'] = clean_df['annual_inc']/clean_df['funded_amnt']

    # final numeric features
    numeric_col = ['term', 'int_rate', 'emp_length', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
                   'total_rec_late_fee', 'credit_hist','per_pymnt', 'per_rec_prncp', 'per_installment','per_annual_inc']

    # final category features
    category_col = ['grade', 'home_ownership','verification_status', 'purpose','initial_list_status']

    # final feature set
    clean_df = clean_df[numeric_col+category_col].copy()

    # "ANY" category was not present in training so replacing it with NONE
    clean_df['home_ownership'] = clean_df['home_ownership'].replace({"ANY":"NONE"})

    # fill na
    clean_df = clean_df.fillna(0)

    # min max scaling; keep the input index so downstream results align with ``df``
    scaled_df = pd.DataFrame(scaler.transform(clean_df[numeric_col]),
                             columns=numeric_col, index=clean_df.index)

    # one hot encoding
    one_hot_vec = enc.transform(clean_df[category_col])
    one_hot_df = pd.DataFrame(one_hot_vec.toarray(), columns=enc.get_feature_names(),
                              index=clean_df.index)

    # converting the encoder's positional prefixes to meaningful names;
    # plain-text (non-regex) replacement, applied in encoder column order
    prefix_labels = {'x0_': 'grade_', 'x1_': 'home_ownership_', 'x2_': 'verification_status_',
                     'x3_': 'purpose_', 'x4_': 'initial_list_status_'}
    for generic, label in prefix_labels.items():
        one_hot_df.columns = one_hot_df.columns.str.replace(generic, label, regex=False)
    one_hot_df.columns = one_hot_df.columns.str.replace(' ', '_', regex=False)

    # final df
    final_df = pd.concat([scaled_df,one_hot_df],axis=1,sort=False)

    # add id columns at the front (id first, then member_id)
    final_df.insert(loc=0,column='member_id',value= df['member_id'].values)
    final_df.insert(loc=0,column='id',value= df['id'].values)

    return final_df

### Load New Dataset

In [6]:
# load the new loan records to be scored (path is relative to the working directory)
data = pd.read_csv('loan_data_2015.csv')
# preview the first rows
data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,68444620,73334399,35000,35000,35000.0,60 months,11.99,778.38,C,C1,Foreign Service Officer,10+ years,MORTGAGE,128000.0,Source Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,home_improvement,Home improvement,200xx,DC,6.46,0.0,Feb-90,0.0,46.0,,17.0,0.0,14277,27.4,46.0,w,35000.0,35000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,56.0,1,INDIVIDUAL,,,,0.0,321.0,146867.0,1.0,11.0,0.0,0.0,28.0,35367.0,49.3,0.0,1.0,5020.0,40.1,52200.0,1.0,4.0,0.0
1,68547583,73437441,8650,8650,8650.0,36 months,5.32,260.5,A,A1,Associate Consultant,< 1 year,MORTGAGE,100000.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,credit_card,Credit card refinancing,462xx,IN,7.28,0.0,Jul-01,0.0,,,15.0,0.0,7158,26.7,24.0,w,8650.0,8650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,165450.0,0.0,1.0,1.0,1.0,11.0,24041.0,88.8,0.0,3.0,3081.0,57.9,26800.0,1.0,0.0,5.0
2,67849662,72708407,4225,4225,4225.0,36 months,14.85,146.16,C,C5,mechanic,5 years,RENT,35000.0,Source Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,debt_consolidation,Debt consolidation,672xx,KS,15.22,2.0,Jul-11,0.0,18.0,,6.0,0.0,1058,24.6,6.0,w,4225.0,4225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,4888.0,0.0,1.0,0.0,0.0,47.0,3830.0,21.9,0.0,0.0,367.0,22.4,4300.0,0.0,0.0,0.0
3,68506885,73396712,10000,10000,10000.0,60 months,11.99,222.4,C,C1,CARDIOVASCULAR TECH,10+ years,RENT,42500.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,debt_consolidation,Debt consolidation,460xx,IN,31.04,0.0,Dec-98,1.0,,,10.0,0.0,5812,40.9,23.0,w,10000.0,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,41166.0,1.0,3.0,0.0,2.0,13.0,35354.0,75.5,1.0,1.0,3118.0,67.4,14200.0,1.0,1.0,1.0
4,68341763,72928789,20000,20000,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,home_improvement,,605xx,IL,10.78,0.0,Aug-00,0.0,,,6.0,0.0,7869,56.2,18.0,w,20000.0,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Dec-15,0.0,,1,JOINT,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,72.8,0.0,2.0,2081.0,64.7,14000.0,2.0,5.0,1.0


### Run PD Model

In [7]:
# process the new data for PD Model: one-hot features built with the PD encoder
clean_data = process_raw(data,pd_enc)
# spot-check three random rows of the encoded frame
clean_data.sample(3)

Unnamed: 0,id,member_id,Grade:A,Grade:B,Grade:C,Grade:D,Grade:E,Grade:F,Grade:G,Status:Not_Verified,Status:Source_Verified,Status:Verified,Loan:above_281,Loan:below_74,Loan:bet_143_212,Loan:bet_212_281,Loan:bet_74_143,Term:36,Term:60,DTI:above_32,DTI:below_8,DTI:bet_16_24,DTI:bet_24_32,DTI:bet_8_16,Purpose:cat_1,Purpose:cat_2,Purpose:cat_3,Purpose:cat_4,Purpose:cat_5,Purpose:small_business
31948,66084769,70779494,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
384683,40400227,43264940,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9007,67829056,72687800,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
## create result data frame: loan identifiers and funded amount;
## the model outputs are added to it as columns in the cells below
result = data[['id','member_id','funded_amnt']].copy()

In [9]:
# credit score for the new data (uses the function's default scorecard file, 'score.csv')
result['credit_score'] = get_credit_score(clean_data)

In [10]:
## get PD
# drop the identifiers plus one dummy column per categorical feature before predicting
# (presumably the reference categories left out when the PD model was fitted - TODO confirm)
drop_col = ['id','member_id','Grade:G','Status:Verified','Loan:below_74','Term:60','DTI:below_8','Purpose:small_business']
predict = pd_model.predict_proba(clean_data.drop(drop_col,axis='columns'))
# NOTE(review): treats probability column 0 as "default" and column 1 as "good";
# verify this ordering against pd_model.classes_
result['Default_Prob'] = predict[:,0]
result['Good_Prob'] = predict[:,1]

### Run LGD Model

In [11]:
# build the scaled / one-hot features shared by the LGD and EAD models
# (this rebinds clean_data, which held the PD features up to this point)
clean_data = prepare_data(data,lgd_ead_enc,lgd_ead_scaler)
clean_data.head(3)

Unnamed: 0,id,member_id,term,int_rate,emp_length,dti,delinq_2yrs,inq_last_6mths,open_acc,total_rec_late_fee,credit_hist,per_pymnt,per_rec_prncp,per_installment,per_annual_inc,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source_Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w
0,68444620,73334399,1.0,0.318314,1.0,0.161581,0.0,0.0,0.223684,0.0,0.442877,0.0,0.0,0.142428,0.012799,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,68547583,73437441,0.0,-0.004845,0.0,0.182091,0.0,0.0,0.197368,0.0,0.246827,0.0,0.0,0.515946,0.049395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,67849662,72708407,0.0,0.45688,0.5,0.38069,0.117647,0.0,0.078947,0.0,0.074753,0.0,0.0,0.728332,0.034223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# select features for LGD Model
feature = ['per_pymnt','per_rec_prncp','grade_B','grade_C','grade_D','grade_E','grade_F','grade_G','initial_list_status_w']
X_data = clean_data[feature]

# add intercept; has_constant='add' always inserts the constant column, whereas the
# default ('skip') leaves it out whenever some feature is constant in the scored
# batch, which would hand the model a frame with one column too few
X_data  = sm.add_constant(X_data, has_constant='add')

# get recovery rate prediction
result['recovery_rate'] = lgd_model.predict(X_data)

### Run EAD Model

In [13]:
# select features for EAD Model
feature = ['per_pymnt']
X_data = clean_data[feature]

# add intercept; has_constant='add' always inserts the constant column, whereas the
# default ('skip') leaves it out whenever per_pymnt is constant in the scored batch
# (e.g. all zeros), which would hand the model a frame with one column too few
X_data  = sm.add_constant(X_data, has_constant='add')

# get credit conversion factor (CCF) prediction
result['CCF'] = ead_model.predict(X_data)

In [14]:
# spot-check three random rows of the combined model outputs
result.sample(3)

Unnamed: 0,id,member_id,funded_amnt,credit_score,Default_Prob,Good_Prob,recovery_rate,CCF
197074,55534189,59135926,15000,664.0,0.285265,0.714735,0.043402,0.930208
254773,51408120,54827885,21450,573.0,0.437507,0.562493,0.000746,0.285644
113935,61482575,65601406,24100,562.0,0.456215,0.543785,0.000568,0.353245


### Computing Expected Loss

In [15]:
# expected loss per loan = default probability x (1 - recovery rate) x exposure,
# where the exposure is taken as CCF x funded amount
result['Expected_Loss'] = result['Default_Prob'] * (1-result['recovery_rate']) * (result['CCF']* result['funded_amnt'])

In [16]:
# summary statistics of the per-loan expected loss
result['Expected_Loss'].describe()

count    421094.000000
mean       5057.830098
std        4123.245796
min           1.065378
25%        1876.801515
50%        3778.740887
75%        7186.388030
max       26020.758761
Name: Expected_Loss, dtype: float64

In [17]:
# spot-check three random rows, now including Expected_Loss
result.sample(3)

Unnamed: 0,id,member_id,funded_amnt,credit_score,Default_Prob,Good_Prob,recovery_rate,CCF,Expected_Loss
276356,48555043,51803787,6000,578.0,0.427369,0.572631,0.025815,0.88807,2218.41831
349277,42964796,45961538,25000,594.0,0.399072,0.600928,0.020755,0.836395,8171.351438
92824,63434712,67777439,9600,719.0,0.209661,0.790339,0.040647,0.938942,1813.030635


In [18]:
# total expected loss over all scored loans
result['Expected_Loss'].sum()

2129821907.0847073

In [19]:
# total funded amount over all scored loans
result['funded_amnt'].sum()

6417580175

In [20]:
# total expected loss as a fraction of the total funded amount
result['Expected_Loss'].sum()/result['funded_amnt'].sum()

0.3318730501227758