In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max.rows', 1000)

data = pd.read_csv('./data/Loan_payments_data.csv')

data['paid_off_time'] = pd.to_datetime(data['paid_off_time']) 
data['effective_date'] = pd.to_datetime(data['effective_date']) 
data['due_date'] = pd.to_datetime(data['due_date'])

fromdate_paid = min(data['paid_off_time'])
fromdate_effe = min(data['effective_date'])
fromdate_due = min(data['due_date'])

train, test = train_test_split(data, test_size=0.3, random_state=1000)

X_train = train.drop(['loan_status'], axis=1)
y_train = train[['Loan_ID','loan_status']]
X_test = test.drop(['loan_status'], axis=1)
y_test = test[['Loan_ID','loan_status']]

X_TEST = X_test.drop(['Loan_ID', 'past_due_days'], axis=1)

X = X_train.drop(['Loan_ID','past_due_days'], axis=1)
Y = y_train['loan_status'].map(lambda x: 1 if x == 'PAIDOFF' else 0)

X_TEST['paid_off_time'] = X_TEST['paid_off_time'].map(lambda x: 1 if x != None else x)
X['paid_off_time'] = X['paid_off_time'].map(lambda x: 1 if x != None else 0)

X_TEST['effective_date'] = (pd.to_datetime(X_TEST['effective_date']) - fromdate_effe).dt.days.astype(int)
X_TEST['due_date'] = (pd.to_datetime(X_TEST['due_date']) - fromdate_due).dt.days.astype(int)

X['effective_date'] = (pd.to_datetime(X['effective_date']) - fromdate_effe).dt.days.astype(int)
X['due_date'] = (pd.to_datetime(X['due_date']) - fromdate_due).dt.days.astype(int)

X_TEST['Gender'] = LabelEncoder().fit_transform(X_TEST['Gender'])
X_TEST['education'] = LabelEncoder().fit_transform(X_TEST['education'])

X['Gender'] = LabelEncoder().fit_transform(X['Gender'])
X['education'] = LabelEncoder().fit_transform(X['education'])

X_TRAIN ,X_VALID, Y_TRAIN, Y_VALID = train_test_split(X, Y, test_size=0.33, random_state=42)

rf = RandomForestClassifier(random_state=42, max_depth=42).fit(X_TRAIN, Y_TRAIN)
y_pred = rf.predict(X_VALID)
print('RF :', roc_auc_score(Y_VALID, y_pred))

gb = GradientBoostingClassifier(random_state=42, max_depth=100).fit(X_TRAIN, Y_TRAIN)
y_pred = gb.predict(X_VALID)
print('GB :', roc_auc_score(Y_VALID, y_pred))

svr = SVR(C=2).fit(X_TRAIN, Y_TRAIN)
y_pred = svr.predict(X_VALID)
print('SVM :', roc_auc_score(Y_VALID, y_pred))

test_pred = rf.predict(X_TEST)
result = pd.DataFrame({'Loan_ID':test['Loan_ID'], 'loan_status':test_pred})
result['loan_status'] = result['loan_status'].map(lambda x: 'PAIDOFF' if x == 1 else 'COLLECTION_PAIDOFF')
result.to_csv('000000.csv', index=False)

RF : 0.6028169014084508
GB : 0.6057902973395931
SVM : 0.6474178403755869


In [2]:
Y_TEST = y_test['loan_status'].map(lambda x: 1 if x == 'PAIDOFF' else 0)
print('RF :', roc_auc_score(Y_TEST, test_pred))
# print(classification_report(Y_TEST, test_pred))
# print(confusion_matrix(Y_TEST, test_pred))

RF : 0.6317750046563605


In [3]:
pd.concat([y_test, result], axis=1)

Unnamed: 0,Loan_ID,loan_status,Loan_ID.1,loan_status.1
319,xqd20160320,COLLECTION,xqd20160320,COLLECTION_PAIDOFF
207,xqd20160208,PAIDOFF,xqd20160208,PAIDOFF
22,xqd20160023,PAIDOFF,xqd20160023,COLLECTION_PAIDOFF
420,xqd20160421,COLLECTION_PAIDOFF,xqd20160421,PAIDOFF
352,xqd20160353,COLLECTION,xqd20160353,PAIDOFF
491,xqd20160492,COLLECTION_PAIDOFF,xqd20160492,PAIDOFF
60,xqd20160061,PAIDOFF,xqd20160061,COLLECTION_PAIDOFF
264,xqd20160265,PAIDOFF,xqd20160265,PAIDOFF
454,xqd20160455,COLLECTION_PAIDOFF,xqd20160455,PAIDOFF
21,xqd20160022,PAIDOFF,xqd20160022,PAIDOFF
