# Credit Risk Model Python
Using Lending Club data to assess which machine learning models best predict creditworthiness.
Note: This was a quick scripting session; a real model would need the logic in these cells generalized into reusable functions, classes, etc.

In [1]:
import pandas as pd
# Load only the first N_ROWS observations of the Lending Club export.
# skiprows=1 is required: the first row of the file has to be skipped
# (presumably a non-header preamble line -- confirm against the raw file).
LOAN_CSV_PATH = '/Users/kevin/Desktop/LoanStats3a.csv'
N_ROWS = 10000
df = pd.read_csv(LOAN_CSV_PATH, skiprows=1, nrows=N_ROWS)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,,Cash,N,,,,,,
1,,,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,,Cash,N,,,,,,
2,,,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,,Cash,N,,,,,,
3,,,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,Cash,N,,,,,,
4,,,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,,,Cash,N,,,,,,


In [2]:
# To list every column name (df.columns did not print them all at once),
# iterate over them explicitly:
#     for column_name in df.columns:
#         print(column_name)

# Peek at the target variable (the value the model will learn to predict)
df.loc[:, 'loan_status'].head()

0     Fully Paid
1    Charged Off
2     Fully Paid
3     Fully Paid
4     Fully Paid
Name: loan_status, dtype: object

In [3]:
# Keep only the columns that looked important on visual inspection for the
# baseline logistic regression.
# Will test PCA and NMF for dimension reduction later.
#
# .copy() makes the subset an independent DataFrame rather than a slice of the
# full frame, so the new columns assigned in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
         'installment', 'grade', 'sub_grade', 'loan_status']].copy()
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,loan_status
0,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,Fully Paid
1,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,Charged Off
2,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,Fully Paid
3,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,Fully Paid
4,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,Fully Paid


### Clean and encode variables

In [4]:
# new_term: numeric version of term, e.g. "36 months" -> 36.
# The digits are extracted explicitly. str.strip(' months') treats its argument
# as a *set of characters* to trim from both ends, not as a suffix, so it only
# produced the right answer by coincidence.
df['new_term'] = pd.to_numeric(df['term'].str.extract(r'(\d+)', expand=False))
# .iloc[0] looks up the first row by position instead of relying on an index label 0
print(type(df['new_term'].iloc[0]))

# new_int_rate: numeric version of int_rate, e.g. "10.65%" -> 10.65
df['new_int_rate'] = pd.to_numeric(df['int_rate'].str.strip('%'))
print(type(df['new_int_rate'].iloc[0]))

# Convert the grade columns to pandas categorical dtype
df['new_grade'] = df['grade'].astype('category')
df['new_sub_grade'] = df['sub_grade'].astype('category')

df.head()

<class 'numpy.int64'>
<class 'numpy.float64'>


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,loan_status,new_term,new_int_rate,new_grade,new_sub_grade
0,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,Fully Paid,36,10.65,B,B2
1,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,Charged Off,60,15.27,C,C4
2,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,Fully Paid,36,15.96,C,C5
3,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,Fully Paid,36,13.49,C,C1
4,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,Fully Paid,60,12.69,B,B5


In [5]:
# The raw string columns are now duplicated by their cleaned new_* counterparts,
# so remove them.
redundant_columns = ['term', 'int_rate', 'grade', 'sub_grade']
df = df.drop(labels=redundant_columns, axis='columns')

In [6]:
# Reorder the columns so that loan_status, the target variable, comes last
column_order = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment', 'new_term',
    'new_grade', 'new_sub_grade', 'new_int_rate',
    'loan_status',
]
df = df[column_order]
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,new_term,new_grade,new_sub_grade,new_int_rate,loan_status
0,5000,5000,4975.0,162.87,36,B,B2,10.65,Fully Paid
1,2500,2500,2500.0,59.83,60,C,C4,15.27,Charged Off
2,2400,2400,2400.0,84.33,36,C,C5,15.96,Fully Paid
3,10000,10000,10000.0,339.31,36,C,C1,13.49,Fully Paid
4,3000,3000,3000.0,67.79,60,B,B5,12.69,Fully Paid


### LabelEncode Features before Logistic Regression

In [9]:
from sklearn.preprocessing import LabelEncoder

# Label-encode (this is NOT one-hot encoding) the categorical columns: each
# distinct value is replaced by an integer code, assigned in sorted order of the
# labels (e.g. 'Charged Off' -> 0, 'Fully Paid' -> 1).
#
# A separate fitted encoder is kept per column so that every mapping can still
# be inspected (encoders[col].classes_) or reversed (inverse_transform) later.
# Refitting one shared encoder would keep only the last column's mapping.
encoders = {}
for column in ['new_grade', 'new_sub_grade', 'loan_status']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` still refers to the encoder fitted on loan_status
le = encoders['loan_status']

df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,new_term,new_grade,new_sub_grade,new_int_rate,loan_status
0,5000,5000,4975.0,162.87,36,1,6,10.65,1
1,2500,2500,2500.0,59.83,60,2,13,15.27,0
2,2400,2400,2400.0,84.33,36,2,14,15.96,1
3,10000,10000,10000.0,339.31,36,2,10,13.49,1
4,3000,3000,3000.0,67.79,60,1,9,12.69,1


### Basic Logistic Regression for comparison purposes

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Features are every column except the last; the last column (loan_status) is the target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

# Fit a logistic regression with default parameters (fit returns the estimator itself)
clf = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = clf.predict(X_test)

# Report accuracy, then the per-class precision/recall/F1 table, then the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is: {}'.format(accuracy))
for report in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
    print(report)

Accuracy is: 0.846
             precision    recall  f1-score   support

          0       0.33      0.00      0.00       461
          1       0.85      1.00      0.92      2539

avg / total       0.77      0.85      0.78      3000

[[   1  460]
 [   2 2537]]


### Conclusion for Base Logistic Regression
In summary: it's trash. Why? Although the accuracy looks good, the precision and recall for class 0 (charged off) are abysmal, and those are what we truly care about. Our logistic regression basically predicted 1 (fully paid) for almost everything and got almost none of the zeros right: only 1 of the 461 charged-off loans in the test set was identified. As this is the baseline model, this is what we'd expect.
In the classification report above, the specificity is the recall of the negative class (the cell where row 0 meets the recall column). We need the specificity to be as high as possible, and 0 percent is as low as it gets. On the bright side, the only way to go is up!

### Potential Steps for Model Improvement
1. Apply PCA/NMF to all features and let unsupervised learning dictate the relevant features before fitting the baseline model.
2. Set the logistic regression decision threshold lower (rather than 50/50, use 15/85 as the decision boundary).
3. Completely different model: Decision Tree, Random Forest, Neural Net.