In [3]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [4]:
#data = pd.read_csv('complete_dataset.txt', sep='\t')
data = pd.read_csv('lc_test.txt', sep='\t')

In [5]:
data = data[data.emp_length != 'na']

In [6]:
data['emp_length'].replace(' 1 year', 1, inplace = True)
data['emp_length'].replace('1 year', 1, inplace = True)

In [7]:
X = data[data.columns.difference(['loan_status'])].values
y = data['loan_status'].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [10]:
#logistic regression
lr = LogisticRegression()
model = lr.fit(X_train, y_train)



In [11]:
pred = model.predict(X_test)

In [12]:
# Accuracy
testlen = X_test.shape[0]
print(sum([pred[i] == y_test[i] for i in range(testlen)]) / float(testlen))

# Specificity: For those who didn't default, how many did it predict correctly?
print(sum([pred[i] == y_test[i] and pred[i] == 0 for i in range(testlen)]) / float(sum([pred[i] == 0 for i in range(testlen)])))

# Sensitivity: For those who did default, how many did it predict correctly?
print(sum([pred[i] == y_test[i] and pred[i] == 1 for i in range(testlen)]) / float(sum([pred[i] == 1 for i in range(testlen)])))

0.9998554147187816
0.9999677205894221
0.9756944444444444


In [13]:
# generate metrics
from sklearn import metrics
print(metrics.accuracy_score(y_test, pred))
print(metrics.confusion_matrix(y_test, pred))
print(metrics.roc_auc_score(y_test, pred))

0.9998554147187816
[[61957     7]
 [    2   281]]
0.9964099466851035


In [14]:
# plot ROC curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

<Figure size 640x480 with 1 Axes>

In [None]:
#L1, L2 logistic regression
from sklearn import linear_model
best_alpha = -1
best_score = -1
for beta in np.arange(0.1, 1, 0.05):
    clf = linear_model.Lasso(alpha=beta).fit(X_train,y_train)
    #print(clf.score(X, y))
    y_hat = clf.predict(X_train)
    MSE = np.sum((y_hat - y_train)**2)/len(X_train)
    if clf.score(X_train, y_train) > best_score:
        best_alpha = beta
        best_score = clf.score(X_train,y_train)
print(best_score)
print(best_alpha)

In [None]:
# 5-fold cross validation
best_alpha = -1
best_score = -1
batch_size = int(X_val.shape[0] / 5)
for beta in np.arange(0.1, 1, 0.05):
    R_square = 0 # the r square value for each fold
    for i in range(5):
        curt_X = X_val[i * batch_size: min((i+1) * batch_size, X_val.shape[0]), :]
        curt_y = y_val[i * batch_size: min((i+1) * batch_size, X_val.shape[0])]
        clf = linear_model.Ridge(alpha=beta).fit(curt_X, curt_y)
        y_hat = clf.predict(curt_X)
        MSE = np.sum((y_hat - curt_y)**2) / batch_size
        R_square += clf.score(curt_X, curt_y)
    if R_square > best_score:
        best_alpha = beta
        best_score = R_square
print(best_score)
print(best_alpha)

In [None]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
# ElasticNet
from sklearn.linear_model import ElasticNet
best_alpha = -1
best_score = -1
batch_size = int(X_train.shape[0] / 5)
for beta in np.arange(0.1, 1, 0.05):
    for l1_val in np.arange(0.1, 1, 0.05):
        R_square = 0 # the r square value for each fold
        for i in range(5):
            curt_X = X_val[i * batch_size: min((i+1) * batch_size, X_val.shape[0]), :]
            curt_y = y_val[i * batch_size: min((i+1) * batch_size, X_val.shape[0])]
            clf = linear_model.Ridge(alpha=beta).fit(curt_X, curt_y)
            y_hat = clf.predict(curt_X)
            MSE = np.sum((y_hat - curt_y)**2) / batch_size
            R_square += clf.score(curt_X, curt_y)
    if R_square > best_score:
        best_alpha = beta
        best_score = R_square
print(best_score)
print(best_alpha)

In [None]:
#gradient boosted trees- to do rachel
from sklearn.ensemble import GradientBoostingRegressor
# 10-fold CV, with shuffle
kf_10 = model_selection.KFold( n_splits=10, shuffle=True, random_state=1)

gbr = sklearn.ensemble.GradientBoostingRegressor(random_state=0)
result = []
best_result = 10
best_params = None
best_r_score = None
for l in range(80, 100):
    for k in range(1, 5):
        for l_rate in np.arange(0.1, 1, 0.05):
            regressor = GradientBoostingRegressor(random_state=0, learning_rate = l_rate, n_estimators = l , max_depth = k)
            score = - model_selection.cross_val_score(regressor, X_train, y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
            r_score = - model_selection.cross_val_score(regressor, X_train, y_train, cv=kf_10).mean()
            if score < best_result:
                best_result = score
                best_params = (l, k)
                best_r_score = r_score
            result.append((score, (l, k, l_rate)))
print(best_result)
print(best_params)
print(best_r_score)