In [72]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

import functools

In [73]:
data = pd.read_csv('input_file_2.csv', sep = ',', index_col=0)

In [74]:
data['issue_d'] = pd.to_datetime(data['issue_d'])

In [75]:
'''Exclude the period from the beginning of 2016 onward as it is noticed in time-series plots that this period has
 a significant portion of the expected defaults not reported yet. This would hence be biased.'''

data = data.loc[data['issue_d'] < '2016-1-1']

In [76]:
all_cols = list(data.columns)

print('to scale     ', all_cols)

print(len(all_cols))

all_cols.remove('charged_off')
all_cols.remove('issue_d')

to scale      ['loan_amnt', 'term', 'installment', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_util', 'total_acc', 'application_type', 'mort_acc', 'pub_rec_bankruptcies', 'log_annual_inc', 'fico_score', 'log_revol_bal', 'charged_off']
21


In [77]:
to_drop_categorical = ['home_ownership', 'verification_status', 'purpose', 'application_type']

In [78]:
for i in to_drop_categorical:

    all_cols.remove(i)

print('to scale     ', all_cols)

print(len(all_cols))


to scale      ['loan_amnt', 'term', 'installment', 'emp_length', 'dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_util', 'total_acc', 'mort_acc', 'pub_rec_bankruptcies', 'log_annual_inc', 'fico_score', 'log_revol_bal']
15


In [79]:
data = pd.get_dummies(data,prefix=["ho","vs","purp","at"], columns=['home_ownership', 'verification_status', 'purpose', 'application_type'], drop_first=False)

In [80]:
train_df = data.loc[data['issue_d'] < data['issue_d'].quantile(0.90)]
test_df = data.loc[data['issue_d'] >= data['issue_d'].quantile(0.90)]

In [81]:
train_df.drop('issue_d', axis=1, inplace=True)
test_df.drop('issue_d', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [82]:
train_df[all_cols] = train_df[all_cols].fillna(train_df[all_cols].mean())
test_df[all_cols] = test_df[all_cols].fillna(train_df[all_cols].mean())
print('null values      ', train_df.isnull().sum())
print(type(train_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


null values       loan_amnt                  0
term                       0
installment                0
emp_length                 0
dti                        0
earliest_cr_line           0
open_acc                   0
pub_rec                    0
revol_util                 0
total_acc                  0
mort_acc                   0
pub_rec_bankruptcies       0
log_annual_inc             0
fico_score                 0
log_revol_bal              0
charged_off                0
ho_MORTGAGE                0
ho_OWN                     0
ho_RENT                    0
vs_Not Verified            0
vs_Source Verified         0
vs_Verified                0
purp_car                   0
purp_credit_card           0
purp_debt_consolidation    0
purp_educational           0
purp_home_improvement      0
purp_house                 0
purp_major_purchase        0
purp_medical               0
purp_moving                0
purp_other                 0
purp_renewable_energy      0
purp_small_business      

In [83]:
scaler = StandardScaler(copy=False)

train_df[all_cols] = scaler.fit_transform(train_df[all_cols], train_df['charged_off'])
test_df[all_cols] = scaler.transform(test_df[all_cols])

print(scaler.mean_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[all_cols] = scaler.fit_transform(train_df[all_cols], train_df['charged_off'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


[1.41525804e+04 4.16541917e+01 4.32144701e+02 5.95372688e+00
 1.75190144e+01 1.99762571e+03 1.13329293e+01 1.83033432e-01
 5.49551971e+01 2.52778843e+01 1.79007251e+00 1.19857837e-01
 4.80227778e+00 6.97463073e+02 4.00522200e+00]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[all_cols] = scaler.transform(test_df[all_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [84]:
y_train = train_df['charged_off']
y_test = test_df['charged_off']

X_train = train_df.drop('charged_off', axis=1)
X_test = test_df.drop('charged_off', axis=1)

In [85]:
# def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
#   label = df[label_key]
#   ds = tf.data.Dataset.from_tensor_slices((dict(df),label))

#   if shuffle:
#     ds = ds.shuffle(10000)

#   ds = ds.batch(batch_size).repeat(num_epochs)

#   return ds

In [86]:
# train_inpf = functools.partial(easy_input_function, train_df, label_key='charged_off',  num_epochs=5, shuffle=True, batch_size=20000)#300000 #230934
# test_inpf = functools.partial(easy_input_function, test_df, label_key='charged_off', num_epochs=1, shuffle=False, batch_size=200000) #200000

In [87]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(train_df.shape[1] - 1, input_shape=(train_df.shape[1] - 1,), activation='tanh'))
model.add(Dense(5, activation='tanh'))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [88]:
model.fit(X_train, y_train,epochs=5, batch_size=20000, verbose=1, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f501148160>