In [1]:
import pandas as pd
import warnings
import pickle
import numpy as np
# Notebook-wide settings: ignore every warning and let wide frames display up to 100 columns
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 100)

In [2]:
# Load the fitted one-hot encoder that process_raw() applies to the categorical columns.
# NOTE(security): pickle.load can execute arbitrary code stored in the file, so
# PD_One_hot.pkl must come from a trusted source.
with open("PD_One_hot.pkl", "rb") as encoder_file:
    # the context manager closes the handle, which a bare open() call left dangling
    enc = pickle.load(encoder_file)

### Preprocess routine

In [3]:
# function for converting dti to category
def get_dti_cat(dti):
    """Return the label of the DTI band that `dti` falls into.

    Bands are 8 points wide with inclusive lower edges: 'below_8',
    'bet_8_16', 'bet_16_24', 'bet_24_32' and 'above_32'.  A value that
    fails every comparison (for example NaN) also ends up in 'above_32'.
    """
    # (exclusive upper bound, label) in ascending order; the first bound that
    # dti is strictly below decides the band
    bands = (
        (8, 'below_8'),
        (16, 'bet_8_16'),
        (24, 'bet_16_24'),
        (32, 'bet_24_32'),
    )
    for upper_bound, label in bands:
        if dti < upper_bound:
            return label
    return 'above_32'

# function for converting loan amount to category
def get_loan_cat(amount):
    """Return the label of the loan-amount band that `amount` falls into.

    Band edges are 7400, 14300, 21200 and 28100 with inclusive lower edges;
    the labels abbreviate them in hundreds ('below_74', 'bet_74_143',
    'bet_143_212', 'bet_212_281', 'above_281').  A value that fails every
    comparison (for example NaN) also ends up in 'above_281'.
    """
    if amount < 7400:
        return 'below_74'
    if amount < 14300:
        return 'bet_74_143'
    if amount < 21200:
        return 'bet_143_212'
    if amount < 28100:
        return 'bet_212_281'
    # 28100 and above, plus anything that was not below any of the edges
    return 'above_281'

def process_raw(raw,enc):
    """Turn raw loan records into the one-hot feature frame used for scoring.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain the columns 'id', 'member_id', 'grade',
        'verification_status', 'loan_amnt', 'term', 'dti' and 'purpose'.
        It is not modified.
    enc : fitted encoder
        Object whose ``transform`` accepts the six categorical columns (in the
        order of ``cat_columns`` below) and returns a sparse matrix, and which
        exposes the generated column names through ``get_feature_names`` (or
        ``get_feature_names_out``).

    Returns
    -------
    pandas.DataFrame
        'id' and 'member_id' followed by one indicator column per encoded
        category, prefixed 'Grade:', 'Status:', 'Loan:', 'Term:', 'DTI:' and
        'Purpose:'; indexed like ``raw``.
    """
    # select features
    imp_feature = ['id','member_id','grade','verification_status','loan_amnt','term','dti','purpose']
    df = raw[imp_feature].copy()

    # Coarse classing of the purpose variable; values that are not keys of the
    # mapping are left unchanged
    purpose_coarse_classing = {'moving': 'cat_1','other':'cat_1','renewable_energy':'cat_1',
                               'house':'cat_2','vacation':'cat_2','educational':'cat_2',
                               'credit_card':'cat_3','home_improvement':'cat_3',
                               'major_purchase':'cat_4','wedding':'cat_4','car':'cat_4',
                               'medical':'cat_5','debt_consolidation':'cat_5'}

    # Assign the result back: replace(..., inplace=True) on the df['purpose']
    # selection is chained assignment, which does not update df when pandas
    # Copy-on-Write is active.
    df['purpose'] = df['purpose'].replace(purpose_coarse_classing)

    # Discretize loan amount
    df['loan_amnt'] = df['loan_amnt'].apply(get_loan_cat).astype(str)

    # Discretize dti
    df['dti'] = df['dti'].apply(get_dti_cat).astype(str)

    # converting term from str to int: keep the first whitespace-separated
    # token (e.g. '36' from '36 months')
    df['term'] = df['term'].str.split().str[0].astype(int)

    # one-hot encode the category columns
    cat_columns = ['grade','verification_status','loan_amnt','term','dti','purpose']
    one_hot_vec = enc.transform(df[cat_columns]).toarray()

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases, so use whichever accessor this encoder offers.
    # NOTE(review): confirm the fallback still yields the 'x0_'..'x5_' prefixes
    # for the pickled encoder; the renaming below depends on them.
    if hasattr(enc, 'get_feature_names'):
        feature_names = enc.get_feature_names()
    else:
        feature_names = enc.get_feature_names_out()

    one_hot_df = pd.DataFrame(one_hot_vec,columns=feature_names,index=df.index)

    # converting column names to more meaningful names: the positional prefix
    # of each encoded column is swapped for a readable label
    prefix_labels = {'x0_': 'Grade:', 'x1_': 'Status:', 'x2_': 'Loan:',
                     'x3_': 'Term:', 'x4_': 'DTI:', 'x5_': 'Purpose:'}
    for positional_prefix, label in prefix_labels.items():
        one_hot_df.columns = one_hot_df.columns.str.replace(positional_prefix,label,regex=False)
    one_hot_df.columns = one_hot_df.columns.str.replace(' ','_',regex=False)

    # drop category columns now that they are encoded
    df = df.drop(columns=cat_columns)

    # concat identifiers and indicator columns
    df = pd.concat([df,one_hot_df],axis=1,sort=False)

    return df

In [4]:
# Read loan_data_2015.csv, the new data to be processed and scored, and preview its first rows
new_data = pd.read_csv(filepath_or_buffer='loan_data_2015.csv')
new_data.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,68444620,73334399,35000,35000,35000.0,60 months,11.99,778.38,C,C1,Foreign Service Officer,10+ years,MORTGAGE,128000.0,Source Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,home_improvement,Home improvement,200xx,DC,6.46,0.0,Feb-90,0.0,46.0,,17.0,0.0,14277,27.4,46.0,w,35000.0,35000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,56.0,1,INDIVIDUAL,,,,0.0,321.0,146867.0,1.0,11.0,0.0,0.0,28.0,35367.0,49.3,0.0,1.0,5020.0,40.1,52200.0,1.0,4.0,0.0
1,68547583,73437441,8650,8650,8650.0,36 months,5.32,260.5,A,A1,Associate Consultant,< 1 year,MORTGAGE,100000.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,credit_card,Credit card refinancing,462xx,IN,7.28,0.0,Jul-01,0.0,,,15.0,0.0,7158,26.7,24.0,w,8650.0,8650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,165450.0,0.0,1.0,1.0,1.0,11.0,24041.0,88.8,0.0,3.0,3081.0,57.9,26800.0,1.0,0.0,5.0
2,67849662,72708407,4225,4225,4225.0,36 months,14.85,146.16,C,C5,mechanic,5 years,RENT,35000.0,Source Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,debt_consolidation,Debt consolidation,672xx,KS,15.22,2.0,Jul-11,0.0,18.0,,6.0,0.0,1058,24.6,6.0,w,4225.0,4225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,4888.0,0.0,1.0,0.0,0.0,47.0,3830.0,21.9,0.0,0.0,367.0,22.4,4300.0,0.0,0.0,0.0
3,68506885,73396712,10000,10000,10000.0,60 months,11.99,222.4,C,C1,CARDIOVASCULAR TECH,10+ years,RENT,42500.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,debt_consolidation,Debt consolidation,460xx,IN,31.04,0.0,Dec-98,1.0,,,10.0,0.0,5812,40.9,23.0,w,10000.0,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Jan-16,0.0,,1,INDIVIDUAL,,,,0.0,0.0,41166.0,1.0,3.0,0.0,2.0,13.0,35354.0,75.5,1.0,1.0,3118.0,67.4,14200.0,1.0,1.0,1.0
4,68341763,72928789,20000,20000,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-15,Issued,n,https://www.lendingclub.com/browse/loanDetail....,,home_improvement,,605xx,IL,10.78,0.0,Aug-00,0.0,,,6.0,0.0,7869,56.2,18.0,w,20000.0,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Jan-16,Dec-15,0.0,,1,JOINT,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,72.8,0.0,2.0,2081.0,64.7,14000.0,2.0,5.0,1.0


In [5]:
# Run the preprocessing routine on the new data, then show three randomly chosen rows
clean_data = process_raw(raw=new_data, enc=enc)
clean_data.sample(n=3)

Unnamed: 0,id,member_id,Grade:A,Grade:B,Grade:C,Grade:D,Grade:E,Grade:F,Grade:G,Status:Not_Verified,Status:Source_Verified,Status:Verified,Loan:above_281,Loan:below_74,Loan:bet_143_212,Loan:bet_212_281,Loan:bet_74_143,Term:36,Term:60,DTI:above_32,DTI:below_8,DTI:bet_16_24,DTI:bet_24_32,DTI:bet_8_16,Purpose:cat_1,Purpose:cat_2,Purpose:cat_3,Purpose:cat_4,Purpose:cat_5,Purpose:small_business
122464,60586639,64607398,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
336288,43529900,46556645,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70700,63897435,68328157,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Read the already-processed training frame and show three randomly chosen rows
train_data = pd.read_csv(filepath_or_buffer='Processed_Train.csv')
train_data.sample(n=3)

Unnamed: 0,id,member_id,Good_loan,Grade:A,Grade:B,Grade:C,Grade:D,Grade:E,Grade:F,Grade:G,Status:Not_Verified,Status:Source_Verified,Status:Verified,Loan:above_281,Loan:below_74,Loan:bet_143_212,Loan:bet_212_281,Loan:bet_74_143,Term:36,Term:60,DTI:above_32,DTI:below_8,DTI:bet_16_24,DTI:bet_24_32,DTI:bet_8_16,Purpose:cat_1,Purpose:cat_2,Purpose:cat_3,Purpose:cat_4,Purpose:cat_5,Purpose:small_business
59822,20718887,22991658,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
97901,662601,847339,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
92740,19666924,21889670,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [7]:
def get_credit_score(df,score_card_path='score.csv'):
    """Score every row of `df` against the scorecard stored at `score_card_path`.

    The scorecard CSV is read with its first column as the index; those index
    labels name the columns of `df` that enter the score.  A constant
    'Intercept' column equal to 1 is added to a copy of `df` first, so the
    scorecard may reference it.  The result is the dot product of the
    selected columns with the scorecard values.  `df` itself is not modified.
    """
    # copy of the input carrying the constant intercept term
    with_intercept = df.assign(Intercept=1)

    # scorecard rows are keyed by feature name
    scorecard = pd.read_csv(score_card_path,index_col=0)

    # pick the scorecard's features in scorecard order and take the dot product
    return with_intercept[scorecard.index].dot(scorecard)

In [8]:
# Attach the scorecard result to the training frame and show three randomly chosen rows
train_data['credit_score'] = get_credit_score(df=train_data)
train_data.sample(n=3)

Unnamed: 0,id,member_id,Good_loan,Grade:A,Grade:B,Grade:C,Grade:D,Grade:E,Grade:F,Grade:G,Status:Not_Verified,Status:Source_Verified,Status:Verified,Loan:above_281,Loan:below_74,Loan:bet_143_212,Loan:bet_212_281,Loan:bet_74_143,Term:36,Term:60,DTI:above_32,DTI:below_8,DTI:bet_16_24,DTI:bet_24_32,DTI:bet_8_16,Purpose:cat_1,Purpose:cat_2,Purpose:cat_3,Purpose:cat_4,Purpose:cat_5,Purpose:small_business,credit_score
150075,1515187,1777389,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,599.0
18068,6699361,4582358,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,630.0
69016,593040,761591,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,562.0


In [9]:
# Attach the scorecard result to the processed new data and show three randomly chosen rows
clean_data['credit_score'] = get_credit_score(df=clean_data)
clean_data.sample(n=3)

Unnamed: 0,id,member_id,Grade:A,Grade:B,Grade:C,Grade:D,Grade:E,Grade:F,Grade:G,Status:Not_Verified,Status:Source_Verified,Status:Verified,Loan:above_281,Loan:below_74,Loan:bet_143_212,Loan:bet_212_281,Loan:bet_74_143,Term:36,Term:60,DTI:above_32,DTI:below_8,DTI:bet_16_24,DTI:bet_24_32,DTI:bet_8_16,Purpose:cat_1,Purpose:cat_2,Purpose:cat_3,Purpose:cat_4,Purpose:cat_5,Purpose:small_business,credit_score
19355,67266201,72067930,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,663.0
287420,48694772,51943507,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,807.0
52128,65611220,70233940,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,566.0


### Population stability index

In [10]:
# Drop the columns that are not important for the population stability index
# (identifiers, the target and the credit score), leaving the one-hot columns.
# The result is assigned back with errors='ignore' instead of dropping in
# place, so re-running this cell once the columns are gone is a no-op rather
# than a KeyError; clean_data simply has no 'Good_loan' column to remove.
drop_col = ['id','member_id','Good_loan','credit_score']
clean_data = clean_data.drop(columns=drop_col, errors='ignore')
train_data = train_data.drop(columns=drop_col, errors='ignore')

In [11]:
# Column sum divided by the row count: for 0/1 indicator columns this is the
# share of rows that fall in each category
new_prop = clean_data.sum().div(len(clean_data))
# same proportion for the training data
train_prop = train_data.sum().div(len(train_data))

In [12]:
# Put the two proportion vectors side by side, one row per one-hot column
prop = (
    pd.concat([train_prop, new_prop], axis=1, keys=['Train', 'New'])
    .rename_axis('Feature')
    .reset_index()
)
# the text before ':' names the variable the indicator column belongs to
prop['Original_Feature'] = prop['Feature'].str.split(':').str[0]
prop

Unnamed: 0,Feature,Train,New,Original_Feature
0,Grade:A,0.172019,0.174154,Grade
1,Grade:B,0.309197,0.279287,Grade
2,Grade:C,0.253132,0.286318,Grade
3,Grade:D,0.157687,0.148789,Grade
4,Grade:E,0.071623,0.082993,Grade
5,Grade:F,0.029196,0.023313,Grade
6,Grade:G,0.007146,0.005146,Grade
7,Status:Not_Verified,0.347965,0.281441,Status
8,Status:Source_Verified,0.278328,0.426425,Status
9,Status:Verified,0.373706,0.292134,Status


In [13]:
# Population stability index contribution of every one-hot category:
#     PSI = (New - Train) * ln(New / Train)
# A category that never occurs in one of the two frames has proportion 0,
# which turns the log term into +/-inf (or NaN when both are 0) and makes the
# summed PSI of its variable infinite.  Exact zeros are therefore replaced by
# a tiny floor before the ratio is taken; non-zero proportions are untouched.
PSI_EPS = 1e-6
train_share = prop['Train'].replace(0, PSI_EPS)
new_share = prop['New'].replace(0, PSI_EPS)
prop['PSI'] = (new_share - train_share) * np.log(new_share / train_share)

# total PSI per original variable = sum over its categories
psi = prop.groupby('Original_Feature')['PSI'].sum()
psi

Original_Feature
DTI        0.131959
Grade      0.011331
Loan       0.042927
Purpose    0.037539
Status     0.097386
Term       0.065915
Name: PSI, dtype: float64

In [14]:
# True where a variable's PSI is below 0.1, the level treated here as "no change"
psi.lt(0.1)

Original_Feature
DTI        False
Grade       True
Loan        True
Purpose     True
Status      True
Term        True
Name: PSI, dtype: bool

The PSI of every variable except DTI is below 0.1, which means there is no change in the new data's population for those variables.

In [15]:
# PSI of the one variable that was not below the 0.1 threshold
psi.loc['DTI']

0.1319590127625515

DTI's PSI is at least 0.1 but below 0.25, which indicates a slight population shift; a slight change is required.