# Antifraud Project 1: Linear Models

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics

## 1. Data Preparation

Loading Data from CSV File

In [71]:
data = pd.read_csv("datacleaned.csv")

Picking Features:

In [72]:
features = ["emp_length_dummy","home_ownership_mortage","home_ownership_own",
            "home_ownership_rent","verification_status_verified","credit_history",
            "mths_since_last_delinq_d","mths_since_last_record_d","mths_since_rcnt_il_d",
            "mths_since_recent_bc_d","mths_since_recent_bc_dlq_d","mths_since_recent_inq_d",
            "mths_since_recent_revol_delinq_d","ln_tot_coll_amt","ln_tot_cur_bal","ln_tot_hi_cred_lim",
            "ln_total_acc","ln_total_bal_ex_mort","ln_total_bal_il","ln_total_bc_limit",
            "ln_total_cu_tl","ln_total_il_high_credit_limit","ln_total_rev_hi_lim","ln_loan_amnt",
            "ln_annual_inc","ln_delinq_2yrs","ln_mths_since_last_delinq","ln_mths_since_last_record",
            "ln_open_acc","ln_pub_rec","ln_revol_bal","ln_acc_now_delinq","ln_open_acc_6m",
            "ln_open_il_6m","ln_open_il_12m","ln_open_il_24m","ln_mths_since_rcnt_il",
            "ln_open_rv_12m","ln_open_rv_24m","ln_max_bal_bc","ln_all_util","ln_inq_fi",
            "ln_inq_last_12m","ln_acc_open_past_24mths","ln_avg_cur_bal","ln_bc_open_to_buy",
            "ln_chargeoff_within_12_mths","ln_delinq_amnt","ln_mo_sin_old_il_acct","ln_mo_sin_old_rev_tl_op",
            "ln_mo_sin_rcnt_rev_tl_op","ln_mo_sin_rcnt_tl","ln_mort_acc","ln_mths_since_recent_bc",
            "ln_mths_since_recent_bc_dlq","ln_mths_since_recent_inq","ln_num_accts_ever_120_pd",
            "ln_num_actv_bc_tl","ln_num_actv_rev_tl","ln_num_bc_sats","ln_num_bc_tl","ln_num_il_tl",
            "ln_num_op_rev_tl","ln_num_rev_accts","ln_num_rev_tl_bal_gt_0","ln_num_sats",
            "ln_num_tl_120dpd_2m","ln_num_tl_30dpd","ln_num_tl_90g_dpd_24m","ln_num_tl_op_past_12m",
            "ln_pub_rec_bankruptcies","ln_tax_liens","ln_mths_since_recent_revol_del"]
data = data.dropna(subset = features)

Normalization:

In [73]:
data_matrix = []
feature_no = ["emp_length_dummy","home_ownership_mortage",
             "home_ownership_own","home_ownership_rent","verification_status_verified",
             "mths_since_rcnt_il_d","mths_since_recent_bc_d"]
for feature in features:
    y = list(data.loc[:, feature])
    if feature not in feature_no:
        y = (y-np.mean(y))/np.std(y)  # normalization
        #print(feature,max(y))
    data_matrix.append(y)
data_matrix = np.mat(data_matrix)
data_matrix = data_matrix.T

Targets:

In [74]:
targets = list(data["loan_status_chargeoff"])
targets = np.mat(targets)
targets = targets.T
print(targets.shape,data_matrix.shape)

(17881, 1) (17881, 73)


Separating training set and test set:

In [75]:
sample_size = len(targets)
#shuffle the dataset
index = np.arange(sample_size)
np.random.shuffle(index)
data_matrix = data_matrix[index,:]
targets = targets[index]
#getting training set:
thres = int(np.floor(0.8*sample_size))
train_data = data_matrix[:thres]
train_targets = targets[:thres]
#getting testing set:
test_data = data_matrix[thres:]
test_targets = targets[thres:]

## 2. Ridge Regression

Tuning model by cross validation:

In [76]:
reg = linear_model.RidgeCV(alphas=[0.1,1,10,100,1000,10000])
reg.fit(train_data,train_targets)
print(reg.alpha_)

1000.0


Assessing the performance of tuned classifier on test set:

In [80]:
ridge_classifier = linear_model.Ridge(alpha=1000.0)
ridge_classifier.fit(train_data,train_targets)
y = ridge_classifier.predict(test_data)
y_pred = []
for y_pre in y:
    if y_pre>=0.04:#Best choice of threshold judging by AUC, highly unstable, too small and cannot be generalized
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.mat(y_pred).T
precision = metrics.precision_score(test_targets,y_pred)
recall = metrics.recall_score(test_targets,y_pred)
auc = metrics.roc_auc_score(test_targets,y_pred)
print("precison: ",precision)
print("recall: ",recall)
print("auc: ",auc)

precison:  0.060495626822157436
recall:  0.6287878787878788
auc:  0.6273112108017768


## 3. Lasso Regression

Tuning model by cross validation:

In [81]:
reg = linear_model.LassoCV(alphas=[0.001,0.005,0.01,0.02,0.03,0.04,0.05])
reg.fit(train_data,train_targets)
print(reg.alpha_)

0.001


  y = column_or_1d(y, warn=True)


Assessing the performance of tuned classifier on test set:

In [86]:
lasso_classifier = linear_model.Lasso(alpha=0.001)
lasso_classifier.fit(train_data,train_targets)
y = lasso_classifier.predict(test_data)
y_pred = []
for y_pre in y:
    if y_pre>=0.035:#Best choice of threshold judging by AUC, highly unstable, too small and cannot be generalized
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.mat(y_pred).T
precision = metrics.precision_score(test_targets,y_pred)
recall = metrics.recall_score(test_targets,y_pred)
auc = metrics.roc_auc_score(test_targets,y_pred)
print("precison: ",precision)
print("recall: ",recall)
print("auc: ",auc)

precison:  0.054037644201578625
recall:  0.6742424242424242
auc:  0.6109963935435634


## 4. Ridge Regression with Ensemble (Undersampling)

Reorganize training data:

In [87]:
pos_index = []
neg_index = []
for i in range(len(train_targets)):
    if train_targets[i] == 0:
        neg_index.append(i)
    else:
        pos_index.append(i)
print(len(pos_index),len(neg_index))

478 13826


Undersampling with ensemble:

In [156]:
predictions = []
for i in range(2000):#tuned
    threshold = 2*478#tuned
    np.random.shuffle(neg_index)
    selected_neg = neg_index[:threshold]
    #Creating the new training data:
    new_train_data = np.r_[train_data[pos_index,:],train_data[selected_neg,:]]
    new_train_targets = np.r_[train_targets[pos_index,:],train_targets[selected_neg,:]]
    #tune the model
    reg = linear_model.RidgeCV(alphas=[0.1,1,10,50,100,500,1000,5000,10000])
    reg.fit(new_train_data,new_train_targets)
    clf = linear_model.Ridge(alpha = reg.alpha_)
    clf.fit(new_train_data,new_train_targets)
    predictions.append(clf.predict(test_data))
y = np.mean(predictions,axis=0)

Assesing the performance:

In [160]:
y_pred = []
for y_pre in y:
    if y_pre>=0.32:#Now the threshold can be generalized and also makes more sense (stable)
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.mat(y_pred).T
precision = metrics.precision_score(test_targets,y_pred)
recall = metrics.recall_score(test_targets,y_pred)
auc = metrics.roc_auc_score(test_targets,y_pred)
print("precison: ",precision)
print("recall: ",recall)
print("auc: ",auc)

precison:  0.05502958579881657
recall:  0.7045454545454546
auc:  0.620487531336588


## 5. Lasso Regression with Ensemle (undersampling)

In [173]:
predictions = []
for i in range(2000):#tuned
    threshold = 478#tuned
    np.random.shuffle(neg_index)
    selected_neg = neg_index[:threshold]
    #Creating the new training data:
    new_train_data = np.r_[train_data[pos_index,:],train_data[selected_neg,:]]
    new_train_targets = np.r_[train_targets[pos_index,:],train_targets[selected_neg,:]]
    #tune the model
    reg = linear_model.LassoCV(alphas=[0.001,0.005,0.01,0.02,0.03,0.04,0.05,0.1,0.5,1,5])
    reg.fit(new_train_data,new_train_targets)
    clf = linear_model.Lasso(alpha = reg.alpha_)
    clf.fit(new_train_data,new_train_targets)
    predictions.append(clf.predict(test_data))
y = np.mean(predictions,axis=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [175]:
y_pred = []
for y_pre in y:
    if y_pre>=0.5:#Now the threshold can be generalized and also makes more sense (stable)
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.mat(y_pred).T
precision = metrics.precision_score(test_targets,y_pred)
recall = metrics.recall_score(test_targets,y_pred)
auc = metrics.roc_auc_score(test_targets,y_pred)
print("precison: ",precision)
print("recall: ",recall)
print("auc: ",auc)

precison:  0.05964912280701754
recall:  0.6439393939393939
auc:  0.6274849364472006


## 6. Review:

Linear models perform poorly due to their linear nature. They tend to give small prediction results given that the number of postive cases are too small compared to its negative counterparts, and thus the linear regressors tend to ignore positive cases. Ensembling linear models with undersampling solves the inbalanced data problem partly, but the performance of the models (judging by its precision, recall and accuracy) is still unsatisfactory and an informed guess would blame it on the linear neture of the models, since the relationship of our target and features are hardly linear.

I believe we can still improve the models' performance by (1)using more advanced ensemble methods, (2)giving postive cases more weights, but I don't think futher investigation on linear models are necessary since the performance will not improve significantly as long as the linear nature does not change. Thus, I suggest we treat linear models simply as a baseline methods and turn to other nonlinear classifiers.