In [1]:
import pandas as pd

Data from the Kaggle Lending Club loan data set:<br>
https://www.kaggle.com/wendykan/lending-club-loan-data#

In [2]:
# Load the raw loan export. low_memory=False turns off chunked parsing so that
# column dtypes are inferred from the whole file at once.
loan_data = pd.read_csv(
    'loan.csv',
    low_memory=False,
)

In [3]:
# Preview the first rows of the raw frame.
loan_data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [4]:
# (rows, columns) of the raw frame.
loan_data.shape

(2260668, 145)

## Simple cleaning

In [5]:
# Keep only the 60-month loans.
# The comparison is done on the whitespace-stripped value: the raw `term`
# strings carry a leading space, and an exact match against the padded literal
# would silently select nothing if the file were ever written without it.
loan_data_clean = loan_data.loc[loan_data['term'].str.strip() == '60 months']

In [6]:
# Discard rows whose loan_status is 'Current' or 'In Grace Period'.
loan_data_clean = loan_data_clean.loc[
    lambda frame: ~frame['loan_status'].isin(['Current', 'In Grace Period'])
]

In [7]:
# Keep rows with an annual income between 1,000 and 1,000,000 (both bounds
# inclusive); rows with a missing income fail the test and are dropped.
loan_data_clean = loan_data_clean.loc[
    lambda frame: frame['annual_inc'].between(1000.0, 1000000.0)
]

In [8]:
# Remove rows whose home_ownership value is 'ANY'.
loan_data_clean = loan_data_clean.loc[
    lambda frame: frame['home_ownership'].ne('ANY')
]

In [9]:
# Narrow the frame to the ten model inputs followed by the `loan_status`
# target, which is deliberately kept as the last column.
loan_data_clean = loan_data_clean.loc[:, [
    'loan_amnt',
    'int_rate',
    'installment',
    'grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'purpose',
    'inq_last_12m',
    'delinq_2yrs',
    'loan_status',
]]

In [10]:
# Drop every row that has a missing value in any selected column, then
# renumber the rows from 0 (the old index is discarded).
loan_data_clean = (
    loan_data_clean
    .dropna()
    .reset_index(drop=True)
)

## Prepare data for NN

In [11]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

### Label encoding

In [12]:
# Work on a copy so the encoded columns do not overwrite `loan_data_clean`.
loan_data_clean_prepared = loan_data_clean.copy()

In [13]:
# Encoder that maps the `grade` strings to integer codes.
grade_encoder = preprocessing.LabelEncoder()

In [14]:
# Learn the mapping from the distinct grade values present after cleaning.
grade_encoder.fit(loan_data_clean['grade'].unique())

LabelEncoder()

In [15]:
# Replace the grade strings with their integer codes; the input is read from
# the untouched `loan_data_clean`, so the cell can be re-run safely.
loan_data_clean_prepared['grade'] = grade_encoder.transform(loan_data_clean['grade'])

In [16]:
# Encoder that maps the `home_ownership` strings to integer codes.
ownership_encoder = preprocessing.LabelEncoder()

In [17]:
# Learn the mapping from the distinct home_ownership values present after cleaning.
ownership_encoder.fit(loan_data_clean['home_ownership'].unique())

LabelEncoder()

In [18]:
# Replace the home_ownership strings with their integer codes (input read from
# the untouched `loan_data_clean`).
loan_data_clean_prepared['home_ownership'] = ownership_encoder.transform(loan_data_clean['home_ownership'])

In [19]:
# Encoder that maps the `purpose` strings to integer codes.
purpose_encoder = preprocessing.LabelEncoder()

In [20]:
# Learn the mapping from the distinct purpose values present after cleaning.
purpose_encoder.fit(loan_data_clean['purpose'].unique())

LabelEncoder()

In [21]:
# Replace the purpose strings with their integer codes (input read from the
# untouched `loan_data_clean`).
loan_data_clean_prepared['purpose'] = purpose_encoder.transform(loan_data_clean['purpose'])

In [22]:
def _emp_length_to_years(emp):
    """Convert an employment-length label to a float number of years.

    '< 1 year' becomes 0.0 and '10+ years' becomes 10.0; any other label is
    split on spaces and its first token is parsed as a float
    (e.g. '3 years' -> 3.0).
    """
    if emp == '< 1 year':
        return 0.0
    if emp == '10+ years':
        return 10.0
    return float(emp.split(' ')[0])


# Labels are read from the untouched `loan_data_clean`; the result is assigned
# as a plain list, i.e. positionally.
loan_data_clean_prepared['emp_length'] = list(
    map(_emp_length_to_years, loan_data_clean['emp_length'])
)

In [23]:
# Binary target: 0 for 'Default' / 'Charged Off', 1 for every other status.
# The statuses are read from `loan_data_clean` (as the other encoding cells
# do) instead of from the column being overwritten; reading the already
# encoded 0/1 column on a re-run would relabel every row as 1.
# NOTE(review): any other status that survives the earlier filters is
# labelled 1 — confirm that this is the intended definition of a good loan.
loan_data_clean_prepared['loan_status'] = [
    0 if stat in ('Default', 'Charged Off') else 1
    for stat in loan_data_clean['loan_status']
]

In [24]:
# Preview the fully numeric frame produced by the encoding cells.
loan_data_clean_prepared.head()

Unnamed: 0,loan_amnt,int_rate,installment,grade,emp_length,home_ownership,annual_inc,purpose,inq_last_12m,delinq_2yrs,loan_status
0,40000,16.14,975.71,2,0.0,0,45000.0,1,4.0,0.0,1
1,20000,17.97,507.55,3,4.0,2,57000.0,2,0.0,0.0,1
2,10450,12.98,237.67,1,10.0,0,58000.0,1,5.0,0.0,1
3,20000,26.31,602.49,4,4.0,0,58000.0,1,2.0,0.0,1
4,10000,12.98,227.43,1,1.0,0,60000.0,0,1.0,1.0,1


### Normalization

In [25]:
# Feature matrix: every column except the last one (the target).
loan_data_clean_prepared_X = loan_data_clean_prepared[
    loan_data_clean_prepared.columns[:-1]
]

In [26]:
# Target vector: the last column of the prepared frame.
loan_data_clean_prepared_Y = loan_data_clean_prepared[
    loan_data_clean_prepared.columns[-1]
]

In [27]:
# Min-max scaler with default settings.
scaler = MinMaxScaler()

In [28]:
# Fit on the feature columns taken directly from `loan_data_clean_prepared`
# rather than from `loan_data_clean_prepared_X`: that name is later rebound to
# the scaled array, so re-running this cell would otherwise fit the scaler on
# already scaled values.
# NOTE(review): the scaler sees every row, including those that end up in the
# test split further down — consider fitting on the training rows only.
scaler.fit(loan_data_clean_prepared.iloc[:, :-1])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [29]:
# Scale the features. The input is re-derived from `loan_data_clean_prepared`
# instead of transforming `loan_data_clean_prepared_X` in place, so running
# this cell twice does not scale the data a second time.
loan_data_clean_prepared_X = scaler.transform(loan_data_clean_prepared.iloc[:, :-1])

In [30]:
# Target as a NumPy array. It is taken from the last column of the prepared
# frame rather than from `loan_data_clean_prepared_Y` itself: once that name
# holds an ndarray, a second `.values` access would raise AttributeError.
loan_data_clean_prepared_Y = loan_data_clean_prepared.iloc[:, -1].values

## NN

In [40]:
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.models import model_from_json
from keras import backend

In [53]:
# Reset Keras' global state before (re)building the model below.
backend.clear_session()

In [54]:
# Fully connected binary classifier.
# The input width is read from the prepared feature matrix instead of being
# hard-coded, so the network stays in sync with the selected feature columns.
n_features = loan_data_clean_prepared_X.shape[1]

model = Sequential()
# First dense layer; it also declares the input width.
model.add(Dense(10, activation='relu', kernel_initializer='random_normal', input_dim=n_features))
# Second dense layer
model.add(Dense(50, activation='tanh', kernel_initializer='random_normal'))
# Third dense layer, followed by 20% dropout
model.add(Dense(100, activation='tanh', kernel_initializer='random_normal'))
model.add(Dropout(0.2))
# Fourth dense layer
model.add(Dense(20, activation='tanh', kernel_initializer='random_normal'))
# Output layer: a single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 50)                550       
_________________________________________________________________
dense_3 (Dense)              (None, 100)               5100      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 20)                2020      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 21        
Total params: 7,801
Trainable params: 7,801
Non-trainable params: 0
____________________________________________________

In [55]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [34]:
from sklearn.model_selection import train_test_split

In [56]:
# Hold out 33% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    loan_data_clean_prepared_X,
    loan_data_clean_prepared_Y,
    test_size=0.33,
    random_state=15,
)

In [57]:
# Train on the training split: 100 epochs, mini-batches of 100 rows.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=100,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x232a1c9cd30>

In [58]:
# Score the held-out split; the result holds the loss followed by the
# compiled metric (accuracy).
score = model.evaluate(
    x=X_test,
    y=y_test,
)
print(score)


[0.5902180095166382, 0.695036768913269]


### Save model, scaler and encoders

In [60]:
# Architecture -> JSON text file
model_json = model.to_json()
with open("model.json", "w") as json_out:
    json_out.write(model_json)

# Trained weights -> HDF5 file
model.save_weights("model.h5")

In [59]:
from pickle import dump

In [62]:
# Persist the fitted scaler and label encoders next to the model files.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left open for the lifetime of
# the kernel.
for pickle_path, fitted_obj in (
    ('scaler.pkl', scaler),
    ('grade_encoder.pkl', grade_encoder),
    ('ownership_encoder.pkl', ownership_encoder),
    ('purpose_encoder.pkl', purpose_encoder),
):
    with open(pickle_path, 'wb') as pickle_file:
        dump(fitted_obj, pickle_file)