In [8]:
# libraries

import pandas as pd
import numpy as np

In [3]:
# read the train, validation data

X_train=pd.read_csv("train/X_train.txt")
y_train=pd.read_csv("train/y_train.txt")

X_val_1=pd.read_csv("val_1/X_val_1.txt")
y_val_1=pd.read_csv("val_1/y_val_1.txt")


In [4]:
#Percentage targets in training sample
(y_train['target'].sum()/y_train['target'].count())*100

4.761726219489333

### Logistic Regression using Lasso

In [5]:
from sklearn.linear_model import LogisticRegression
logl1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=101)
logl1.fit(X_train, y_train['target'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=101, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lasso_train_pred=logl1.predict_proba(X_val_1)
lasso_train_pred
predicted=[1 if i > 0.06 else 0 for i in lasso_train_pred[:,1]]

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_val_1,predicted))
print(classification_report(y_val_1,predicted))
print(accuracy_score(y_val_1, predicted))

from sklearn.metrics import roc_auc_score
roc_auc_score(y_val_1,predicted)

[[198680  60888]
 [  3944   9035]]
              precision    recall  f1-score   support

           0       0.98      0.77      0.86    259568
           1       0.13      0.70      0.22     12979

    accuracy                           0.76    272547
   macro avg       0.55      0.73      0.54    272547
weighted avg       0.94      0.76      0.83    272547

0.762125431576939


0.7307750695499717

### Logistic Regression using Ridge

In [18]:
#from sklearn.linear_model import LogisticRegression
logl2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=101)
logl2.fit(X_train, y_train['target'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=101, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
ridge_train_pred=logl2.predict_proba(X_val_1)
predicted=[1 if i > 0.06 else 0 for i in ridge_train_pred[:,1]]

In [20]:
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_val_1,predicted))
print(classification_report(y_val_1,predicted))
print(accuracy_score(y_val_1, predicted))
roc_auc_score(y_val_1,predicted)

[[181495  78073]
 [  3676   9303]]
              precision    recall  f1-score   support

           0       0.98      0.70      0.82    259568
           1       0.11      0.72      0.19     12979

    accuracy                           0.70    272547
   macro avg       0.54      0.71      0.50    272547
weighted avg       0.94      0.70      0.79    272547

0.7000554032882402


0.7079963607243783

#### Based on validationa sample results, Lasso performs better in terms of accuracy and has a higher number of correctly predicted targets than Ridge

### Logistic Regression with RFE

In [118]:
#### Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE
#from sklearn.linear_model import LogisticRegression
#log = LogisticRegression()
rfe = RFE(logl1, 20)
rfe = rfe.fit(X_train, y_train['target'])
print(rfe.support_)
print(rfe.ranking_)

[False  True False  True  True False  True  True  True  True  True False
  True  True  True False  True False  True  True False False False False
 False False False False  True  True  True  True  True  True]
[ 4  1  8  1  1  5  1  1  1  1  1  2  1  1  1 14  1 15  1  1 13 11 10 12
  7  3  6  9  1  1  1  1  1  1]


In [119]:
c1=rfe.ranking_

In [120]:
c2=X_train.columns
c2

Index(['ACH_IN_MTD_AMT', 'ACH_IN_MTD_QTY', 'ACH_OUT_MTD_AMT',
       'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY', 'DEBIT_CARD_MTD_AMT',
       'DEBIT_CARD_MTD_QTY', 'MOBILE_STD_DEP_QTY', '%diff_ACH_IN',
       'diff_ACH_IN_QTY', '%diff_ACH_OUT', '%diff_CHECK_WRITTEN',
       'diff_CHECK_WRITTEN', '%diff_DEBIT_CARD', 'diff_DEBIT_CARD_QTY',
       'CHK_WRITTEN_per_trans', 'inactive_months', 'CONS_LOAN_BAL_AMT',
       'CONS_LOAN_WAR_PCT', 'CONS_DEPOSIT_ACCT_QTY', 'CONS_DEPOSIT_BAL_AMT',
       'MORTGAGE_BAL_AMT', 'SAVINGS_BAL_AMT', 'CHECKING_BAL_AMT',
       'CREDIT_CARD_BAL_AMT', 'LAST_DIRECT_DEPOSIT_AMT', 'AVG_MONTHLY_BAL_AMT',
       'LAST_STMT_BAL_AMT', '%diff_AVG_MONTHLY_BAL', '%diff_LAST_STMT_BAL',
       'recency', 'DIRECT_DEP_IND_Y', 'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y'],
      dtype='object')

In [121]:
X_new=[]
for i in range(0,len(c1)):
    if c1[i]==1:
        X_new.append(c2[i])
        i=i+1

X_new

['ACH_IN_MTD_QTY',
 'ACH_OUT_MTD_QTY',
 'CHK_WRITTEN_MTD_QTY',
 'DEBIT_CARD_MTD_QTY',
 'MOBILE_STD_DEP_QTY',
 '%diff_ACH_IN',
 'diff_ACH_IN_QTY',
 '%diff_ACH_OUT',
 'diff_CHECK_WRITTEN',
 '%diff_DEBIT_CARD',
 'diff_DEBIT_CARD_QTY',
 'inactive_months',
 'CONS_LOAN_WAR_PCT',
 'CONS_DEPOSIT_ACCT_QTY',
 '%diff_AVG_MONTHLY_BAL',
 '%diff_LAST_STMT_BAL',
 'recency',
 'DIRECT_DEP_IND_Y',
 'ACTIVE_CHK_IND_Y',
 'HABITUAL_OD_IND_Y']

In [135]:
X_ref=X_train[X_new]
X_ref.shape

(80013, 20)

In [136]:
import statsmodels.api as sm

logit_model=sm.Logit(y_train['target'],X_ref)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.181275
         Iterations 9
                            Results: Logit
Model:                 Logit             Pseudo R-squared:  0.053     
Dependent Variable:    target            AIC:               29048.7189
Date:                  2020-04-13 19:28  BIC:               29234.5178
No. Observations:      80013             Log-Likelihood:    -14504.   
Df Model:              19                LL-Null:           -15318.   
Df Residuals:          79993             LLR p-value:       0.0000    
Converged:             1.0000            Scale:             1.0000    
No. Iterations:        9.0000                                         
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
ACH_IN_MTD_QTY        -0.1703   0.0122 -13.9191 0.0000 -0.1943 -0.1463
ACH

In [137]:
# drop variables that are statistically insignificant (p>0.005)
# %diff_ACH_IN,%diff_DEBIT_CARD, %diff_ACH_OUT
X_ref2=X_ref.drop(['%diff_ACH_IN','%diff_DEBIT_CARD', '%diff_ACH_OUT'], axis=1)

In [138]:
logit_model2=sm.Logit(y_train['target'],X_ref2)
result=logit_model2.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.181304
         Iterations 9
                            Results: Logit
Model:                 Logit             Pseudo R-squared:  0.053     
Dependent Variable:    target            AIC:               29047.4245
Date:                  2020-04-13 19:28  BIC:               29205.3535
No. Observations:      80013             Log-Likelihood:    -14507.   
Df Model:              16                LL-Null:           -15318.   
Df Residuals:          79996             LLR p-value:       0.0000    
Converged:             1.0000            Scale:             1.0000    
No. Iterations:        9.0000                                         
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
ACH_IN_MTD_QTY        -0.1723   0.0121 -14.2006 0.0000 -0.1961 -0.1486
ACH

In [126]:
X_ref2.columns

Index(['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'DEBIT_CARD_MTD_QTY', 'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY',
       'diff_CHECK_WRITTEN', 'diff_DEBIT_CARD_QTY', 'inactive_months',
       'CONS_LOAN_WAR_PCT', 'CONS_DEPOSIT_ACCT_QTY', '%diff_AVG_MONTHLY_BAL',
       '%diff_LAST_STMT_BAL', 'recency', 'DIRECT_DEP_IND_Y',
       'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y'],
      dtype='object')

In [127]:
X_val1_rfe=X_val_1[['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'DEBIT_CARD_MTD_QTY', 'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY',
       'diff_CHECK_WRITTEN', 'diff_DEBIT_CARD_QTY', 'inactive_months',
       'CONS_LOAN_WAR_PCT', 'CONS_DEPOSIT_ACCT_QTY', '%diff_AVG_MONTHLY_BAL',
       '%diff_LAST_STMT_BAL', 'recency', 'DIRECT_DEP_IND_Y',
       'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y']]

In [128]:
logl1.fit(X_ref2, y_train['target'])
# predict on validation sample
rfe_train_pred=logl1.predict_proba(X_val1_rfe)
predicted=[1 if i > 0.04 else 0 for i in rfe_train_pred[:,1]]

In [81]:
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_val_1,predicted))
print(classification_report(y_val_1,predicted))
print('Accuracy')
print(accuracy_score(y_val_1, predicted))
print('AUC')
print(roc_auc_score(y_val_1, predicted))

[[165730  93838]
 [  2797  10182]]
              precision    recall  f1-score   support

           0       0.98      0.64      0.77    259568
           1       0.10      0.78      0.17     12979

    accuracy                           0.65    272547
   macro avg       0.54      0.71      0.47    272547
weighted avg       0.94      0.65      0.75    272547

Accuracy
0.6454373007224442
AUC
0.711490988919236


#### l1 and l2 give comparable accuracy/recall

In [129]:
# test data
X_y_test=pd.read_csv("test/X_y_test.txt")
X_test=X_y_test[['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'DEBIT_CARD_MTD_QTY', 'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY',
       'diff_CHECK_WRITTEN', 'diff_DEBIT_CARD_QTY', 'inactive_months',
       'CONS_LOAN_WAR_PCT', 'CONS_DEPOSIT_ACCT_QTY', '%diff_AVG_MONTHLY_BAL',
       '%diff_LAST_STMT_BAL', 'recency', 'DIRECT_DEP_IND_Y',
       'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y']]
y_test=X_y_test[['target']]

In [130]:
rfe_test_pred=logl1.predict_proba(X_test)
predicted=[1 if i > 0.05 else 0 for i in rfe_test_pred[:,1]]

In [131]:
print(confusion_matrix(y_test,predicted))
print(classification_report(y_test,predicted))
print('Accuracy')
print(accuracy_score(y_test, predicted))
print('AUC')
print(roc_auc_score(y_test, predicted))

[[529018 212997]
 [  2488   6446]]
              precision    recall  f1-score   support

           0       1.00      0.71      0.83    742015
           1       0.03      0.72      0.06      8934

    accuracy                           0.71    750949
   macro avg       0.51      0.72      0.44    750949
weighted avg       0.98      0.71      0.82    750949

Accuracy
0.7130497543774611
AUC
0.7172305856800142


### 15, 10 variables for RFE (using lasso)

In [100]:
#### Recursive Feature Elimination (RFE)

#from sklearn.feature_selection import RFE
#from sklearn.linear_model import LogisticRegression
#log = LogisticRegression()
rfe = RFE(logl1, 10)
rfe = rfe.fit(X_train, y_train['target'])
print(rfe.support_)
print(rfe.ranking_)

[False  True False  True  True False False  True  True  True False False
 False False False False False False False False False False False False
 False False False False  True False False  True  True  True]
[14  1 18  1  1 15  9  1  1  1  3 12  4  5  6 24  2 25  8  7 23 21 20 22
 17 13 16 19  1 11 10  1  1  1]


In [101]:
c1=rfe.ranking_

In [102]:
c2=X_train.columns
c2

Index(['ACH_IN_MTD_AMT', 'ACH_IN_MTD_QTY', 'ACH_OUT_MTD_AMT',
       'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY', 'DEBIT_CARD_MTD_AMT',
       'DEBIT_CARD_MTD_QTY', 'MOBILE_STD_DEP_QTY', '%diff_ACH_IN',
       'diff_ACH_IN_QTY', '%diff_ACH_OUT', '%diff_CHECK_WRITTEN',
       'diff_CHECK_WRITTEN', '%diff_DEBIT_CARD', 'diff_DEBIT_CARD_QTY',
       'CHK_WRITTEN_per_trans', 'inactive_months', 'CONS_LOAN_BAL_AMT',
       'CONS_LOAN_WAR_PCT', 'CONS_DEPOSIT_ACCT_QTY', 'CONS_DEPOSIT_BAL_AMT',
       'MORTGAGE_BAL_AMT', 'SAVINGS_BAL_AMT', 'CHECKING_BAL_AMT',
       'CREDIT_CARD_BAL_AMT', 'LAST_DIRECT_DEPOSIT_AMT', 'AVG_MONTHLY_BAL_AMT',
       'LAST_STMT_BAL_AMT', '%diff_AVG_MONTHLY_BAL', '%diff_LAST_STMT_BAL',
       'recency', 'DIRECT_DEP_IND_Y', 'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y'],
      dtype='object')

In [103]:
X_new=[]
for i in range(0,len(c1)):
    if c1[i]==1:
        X_new.append(c2[i])
        i=i+1

X_new

['ACH_IN_MTD_QTY',
 'ACH_OUT_MTD_QTY',
 'CHK_WRITTEN_MTD_QTY',
 'MOBILE_STD_DEP_QTY',
 '%diff_ACH_IN',
 'diff_ACH_IN_QTY',
 '%diff_AVG_MONTHLY_BAL',
 'DIRECT_DEP_IND_Y',
 'ACTIVE_CHK_IND_Y',
 'HABITUAL_OD_IND_Y']

In [104]:
X_ref=X_train[X_new]
X_ref.shape

(80013, 10)

In [105]:
import statsmodels.api as sm

logit_model=sm.Logit(y_train['target'],X_ref)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.214417
         Iterations 9
                            Results: Logit
Model:                 Logit             Pseudo R-squared:  -0.120    
Dependent Variable:    target            AIC:               34332.3168
Date:                  2020-04-13 17:45  BIC:               34425.2163
No. Observations:      80013             Log-Likelihood:    -17156.   
Df Model:              9                 LL-Null:           -15318.   
Df Residuals:          80003             LLR p-value:       1.0000    
Converged:             1.0000            Scale:             1.0000    
No. Iterations:        9.0000                                         
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
ACH_IN_MTD_QTY        -0.2088   0.0123 -16.9133 0.0000 -0.2329 -0.1846
ACH

In [106]:
X_ref2=X_ref.drop(['%diff_ACH_IN'], axis=1)
logit_model2=sm.Logit(y_train['target'],X_ref2)
result=logit_model2.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.214418
         Iterations 9
                            Results: Logit
Model:                 Logit             Pseudo R-squared:  -0.120    
Dependent Variable:    target            AIC:               34330.4694
Date:                  2020-04-13 17:45  BIC:               34414.0789
No. Observations:      80013             Log-Likelihood:    -17156.   
Df Model:              8                 LL-Null:           -15318.   
Df Residuals:          80004             LLR p-value:       1.0000    
Converged:             1.0000            Scale:             1.0000    
No. Iterations:        9.0000                                         
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
ACH_IN_MTD_QTY        -0.2080   0.0122 -17.0549 0.0000 -0.2320 -0.1841
ACH

In [107]:
X_ref2.columns

Index(['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY', '%diff_AVG_MONTHLY_BAL',
       'DIRECT_DEP_IND_Y', 'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y'],
      dtype='object')

In [108]:
X_val1_rfe=X_val_1[['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY', '%diff_AVG_MONTHLY_BAL',
       'DIRECT_DEP_IND_Y', 'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y']]

In [109]:
logl1.fit(X_ref2, y_train['target'])
# predict on validation sample
rfe_train_pred=logl1.predict_proba(X_val1_rfe)
predicted=[1 if i > 0.05 else 0 for i in rfe_train_pred[:,1]]

In [110]:
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_val_1,predicted))
print(classification_report(y_val_1,predicted))
print('Accuracy')
print(accuracy_score(y_val_1, predicted))
print('AUC')
print(roc_auc_score(y_val_1, predicted))

[[184053  75515]
 [  3785   9194]]
              precision    recall  f1-score   support

           0       0.98      0.71      0.82    259568
           1       0.11      0.71      0.19     12979

    accuracy                           0.71    272547
   macro avg       0.54      0.71      0.51    272547
weighted avg       0.94      0.71      0.79    272547

Accuracy
0.7090410094405736
AUC
0.7087246877488577


In [111]:
# test data
#X_y_test=pd.read_csv("test/X_y_test.txt")
X_test=X_y_test[['ACH_IN_MTD_QTY', 'ACH_OUT_MTD_QTY', 'CHK_WRITTEN_MTD_QTY',
       'MOBILE_STD_DEP_QTY', 'diff_ACH_IN_QTY', '%diff_AVG_MONTHLY_BAL',
       'DIRECT_DEP_IND_Y', 'ACTIVE_CHK_IND_Y', 'HABITUAL_OD_IND_Y']]
y_test=X_y_test[['target']]

In [112]:
# predict on test sample
rfe_test_pred=logl1.predict_proba(X_test)
predicted=[1 if i > 0.05 else 0 for i in rfe_test_pred[:,1]]

In [113]:
print(confusion_matrix(y_test,predicted))
print(classification_report(y_test,predicted))
print('Accuracy')
print(accuracy_score(y_test, predicted))
print('AUC')
print(roc_auc_score(y_test, predicted))

[[525543 216472]
 [  2622   6312]]
              precision    recall  f1-score   support

           0       1.00      0.71      0.83    742015
           1       0.03      0.71      0.05      8934

    accuracy                           0.71    750949
   macro avg       0.51      0.71      0.44    750949
weighted avg       0.98      0.71      0.82    750949

Accuracy
0.708243835466856
AUC
0.7073895484717532
