In [1]:
# Make the project's ../src directory importable so that the modules imported
# below (dsproject, model) can be resolved from this notebook.
import sys
import os
module_path = os.path.abspath(os.path.join('../src'))

# Only append once, so re-running the cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas SettingWithCopyWarning and library deprecation notices.
warnings.filterwarnings('ignore')
    
# These imports must stay below the sys.path update above.
from dsproject import dsproject
import pandas as pd
# NOTE(review): the star import presumably supplies the names later cells use
# without importing them (train_dataset, validate_dataset, score_dataset,
# feature_selection, train_test_split, RFC, validate_model,
# classification_report, make_feature_importance_table) -- confirm in src/model.
from model import *

# Project helper object; later cells call its read_table / write_table methods.
dsp = dsproject()



In [2]:
# Read the table named <train_dataset>_group1_prediction via the project helper.
train = dsp.read_table(train_dataset + '_group1_prediction', 'output')
# Keep only the rows whose group1_prediction is 0. The explicit .copy() makes
# this an independent frame, so adding prediction columns to it later is not a
# chained assignment on a slice of `train`.
nongroup1_train = train.loc[train['group1_prediction'] == 0].copy()

In [3]:
# Read the table named <validate_dataset>_group1_prediction via the project helper.
validate = dsp.read_table(validate_dataset + '_group1_prediction', 'output')
# Keep only the rows whose group1_prediction is 0. The explicit .copy() makes
# this an independent frame, so adding prediction columns to it later is not a
# chained assignment on a slice of `validate`.
nongroup1_validate = validate.loc[validate['group1_prediction'] == 0].copy()

In [4]:
# Read the table named <score_dataset>_group1_prediction via the project helper.
score = dsp.read_table(score_dataset + '_group1_prediction', 'output')
# Keep only the rows whose group1_prediction is 0. The explicit .copy() makes
# this an independent frame, so adding prediction columns to it later is not a
# chained assignment on a slice of `score`.
nongroup1_score = score.loc[score['group1_prediction'] == 0].copy()

In [5]:
# Run feature selection on the non-group1 training rows with 'group5' as the
# target name.
# NOTE(review): the last two return values look like the numeric / categorical
# columns that were dropped -- confirm in feature_selection.
(x, y,
 col_to_drop_numeric,
 col_to_drop_cat) = feature_selection(nongroup1_train, 'group5')

# The validation rows are processed with the drop lists obtained from training.
x_validate, y_validate, _, _ = feature_selection(
    nongroup1_validate,
    'group5',
    col_to_drop_numeric,
    col_to_drop_cat,
)

['AR_RSK_GRD', 'LAST_DLQ_MO', 'CRN_CYC_AMT', 'NON_AUTO_PYMT_AMT']
['E_MAIL_F', 'AFF_MBR_TP_ID', 'DLQ_ST_ID', 'INCM_RNG_ID', 'AU_ID', 'IP_LCS_TP_ID', 'IS_PNP_CARD_F', 'NAT_ID', 'IDENTN_TP_ID', 'CST_TP_ID', 'PRVT_WLTH_F', 'IS_INACT_ST_P8M_F', 'IS_STFF_F', 'CARD_TP', 'GND_ID', 'RACE_ID', 'COLL_BR_NBR', 'IS_CLCB_F', 'OCP_ID', 'IS_CARD_VLD_F', 'CC_ST_ID', 'AFF_MBR_ORG_ID', 'SALUT_EN', 'NPL_F', 'MBL_PH_F']


In [6]:
# Hold out 20% of the selected training rows as a test split. The fixed
# random_state makes the split, and every metric computed from it, repeatable
# after a kernel restart.
# NOTE(review): assumes train_test_split is scikit-learn's, made available by
# `from model import *` -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
import numpy as np

def balanced_subsample(y, random_state=None):
    """Return index labels of a class-balanced random subsample of ``y``.

    Every distinct label in ``y`` contributes the same number of rows, equal
    to the size of the smallest class, drawn without replacement.

    Parameters
    ----------
    y : pandas.Series
        Class labels; the Series index identifies the rows.
    random_state : int or None, optional
        Seed for a private generator. With the default ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    list
        Index labels of the selected rows, grouped by class in the order
        given by ``y.value_counts()``. Empty when ``y`` has no labels.
    """
    counts = y.value_counts()
    if counts.empty:
        return []

    # Size every class by the minority count. Sizing by the count of label 1
    # made choice(..., replace=False) raise whenever 1 was not the smallest class.
    n_smp = int(counts.min())
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    subsample = []
    for label in counts.index:
        samples = y[y == label].index.values
        # Draw positions first, then map them back to index labels.
        positions = rng.choice(samples.shape[0], size=n_smp, replace=False)
        subsample += samples[positions].tolist()

    return subsample

In [8]:
# Train `nvoters` classifiers, each on its own class-balanced subsample of the
# training split, and keep them in `estimators` keyed by their position.
estimators = {}
nvoters = 5
# Seed NumPy's global generator so the subsamples drawn by balanced_subsample
# are the same on every fresh run.
np.random.seed(42)
for i in range(nvoters):
    subsample = balanced_subsample(y_train)
    x_sub = x_train.loc[subsample]
    y_sub = y_train.loc[subsample]
    # A fixed but distinct random_state per voter keeps each model repeatable
    # while still differing from the others.
    # NOTE(review): assumes RFC is scikit-learn's RandomForestClassifier -- confirm.
    model = RFC(n_estimators=50, n_jobs=-1, oob_score=True, min_samples_leaf=10,
                random_state=i)
    model.fit(x_sub, y_sub)
    estimators[i] = model
    # Report on the rows the voter was fitted on, then on the whole training
    # split (which still contains those rows).
    print("Estimator " + str(i) + " Subsample")
    validate_model(estimators[i], x_sub, y_sub)
    print("Estimator " + str(i) + " Whole Set")
    validate_model(estimators[i], x_train, y_train)

Estimator 0 Subsample
             precision    recall  f1-score   support

          0       0.87      0.85      0.86      8202
          1       0.85      0.88      0.86      8202

avg / total       0.86      0.86      0.86     16404

Estimator 0 Whole Set
             precision    recall  f1-score   support

          0       0.99      0.76      0.86    101307
          1       0.23      0.88      0.36      8202

avg / total       0.93      0.77      0.82    109509

Estimator 1 Subsample
             precision    recall  f1-score   support

          0       0.88      0.84      0.86      8202
          1       0.85      0.89      0.87      8202

avg / total       0.86      0.86      0.86     16404

Estimator 1 Whole Set
             precision    recall  f1-score   support

          0       0.99      0.75      0.86    101307
          1       0.23      0.89      0.36      8202

avg / total       0.93      0.76      0.82    109509

Estimator 2 Subsample
             precision    reca

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix
def count_votes(x, y, threshold=0.75, models=None):
    """Combine several fitted classifiers into one prediction by voting.

    Parameters
    ----------
    x : array-like
        Feature rows, passed unchanged to each voter's ``predict`` and
        ``predict_proba``.
    y : array-like or None
        True labels. When given, a classification report for the voted
        prediction is printed; when ``None`` nothing is printed.
    threshold : float, optional
        A row is labelled 1 when the fraction of voters predicting 1 is
        strictly greater than this value.
    models : sequence of fitted classifiers, optional
        Voters to use. Defaults to ``estimators[0] .. estimators[nvoters - 1]``
        from the notebook namespace.

    Returns
    -------
    total_votes : numpy.ndarray of int
        Voted 0/1 label per row.
    total_ps : numpy.ndarray of float
        Element-wise mean of the voters' ``predict_proba`` outputs.

    Raises
    ------
    ValueError
        If there are no voters to combine.
    """
    if models is None:
        models = [estimators[i] for i in range(nvoters)]
    if len(models) == 0:
        raise ValueError("count_votes needs at least one fitted estimator")

    votes = []
    ps = []
    for voter in models:
        votes.append(list(voter.predict(x)))
        ps.append(list(voter.predict_proba(x)))

    # assumes predict() yields 0/1 labels, so the mean is the share of 1-votes
    votes = np.array(votes, dtype=int)
    vote_fraction = votes.mean(axis=0)
    total_votes = np.array(vote_fraction > threshold, dtype=int)
    total_ps = np.array(ps, dtype=float).mean(axis=0)

    if y is not None:
        print(classification_report(y, total_votes))
    return total_votes, total_ps

In [10]:
# A row is labelled 1 when more than this fraction of the voters predict 1;
# with the 5 voters trained above that means at least two of them.
threshold = 0.2

print('Result : group5_voting test')
total_votes, total_ps = count_votes(x_test, y_test, threshold=threshold)

print('Result : group5_voting validate')
total_votes_validate, total_ps_validate = count_votes(
    x_validate, y_validate, threshold=threshold
)

Result : group5_voting test
             precision    recall  f1-score   support

          0       0.98      0.72      0.83     25227
          1       0.20      0.83      0.33      2151

avg / total       0.92      0.73      0.79     27378

Result : group5_voting validate
             precision    recall  f1-score   support

          0       0.98      0.74      0.85     40315
          1       0.21      0.83      0.34      3372

avg / total       0.92      0.75      0.81     43687



In [11]:
# Baseline for comparison with the vote: the report of one voter on its own
# (the one stored under key 1) for the validation rows.
print("Using only one classifier with validate set.")

baseline_estimator = estimators[1]
yhat = baseline_estimator.predict(x_validate)
baseline_report = classification_report(y_validate, yhat)
print(baseline_report)

Using only one classifier with validate set.
             precision    recall  f1-score   support

          0       0.98      0.76      0.86     40315
          1       0.22      0.80      0.34      3372

avg / total       0.92      0.76      0.82     43687



In [12]:
# Same feature selection for the scoring rows. None is passed where the other
# calls pass the target name 'group5'; the drop lists are the ones obtained
# from the training rows.
x_score, y_score, _, _ = feature_selection(
    nongroup1_score,
    None,
    col_to_drop_numeric,
    col_to_drop_cat,
)

In [13]:
# Vote on the scoring rows; y is None, so count_votes prints no report.
total_votes_score, total_ps_score = count_votes(x_score, None, threshold=threshold)

In [21]:
# Vote on every non-group1 training row.
# NOTE(review): x contains the rows the voters were fitted on, so the printed
# report is optimistic. This call also rebinds total_votes / total_ps, which
# previously held the test-split results.
total_votes, total_ps = count_votes(x, y, threshold)
# assign() builds a new frame rather than writing columns into a frame that was
# produced by filtering `train`, avoiding chained-assignment ambiguity.
# Column 1 of the averaged predict_proba output is presumably the probability
# of class 1 -- confirm against the estimators' classes_.
nongroup1_train = nongroup1_train.assign(
    group5_prediction=total_votes,
    group5_probability=total_ps[:, 1],
)
dsp.write_table(nongroup1_train, train_dataset + '_group5_prediction', 'output')

             precision    recall  f1-score   support

          0       0.99      0.73      0.84    126534
          1       0.21      0.89      0.34     10353

avg / total       0.93      0.74      0.80    136887



In [14]:
# Attach the voted label and averaged probability to the validation rows.
# assign() builds a new frame rather than writing columns into a frame that was
# produced by filtering `validate`, avoiding chained-assignment ambiguity.
# Column 1 of the averaged predict_proba output is presumably the probability
# of class 1 -- confirm against the estimators' classes_.
nongroup1_validate = nongroup1_validate.assign(
    group5_prediction=total_votes_validate,
    group5_probability=total_ps_validate[:, 1],
)
dsp.write_table(nongroup1_validate, validate_dataset + '_group5_prediction', 'output')

In [15]:
# Attach the voted label and averaged probability to the scoring rows.
# assign() builds a new frame rather than writing columns into a frame that was
# produced by filtering `score`, avoiding chained-assignment ambiguity.
# Column 1 of the averaged predict_proba output is presumably the probability
# of class 1 -- confirm against the estimators' classes_.
nongroup1_score = nongroup1_score.assign(
    group5_prediction=total_votes_score,
    group5_probability=total_ps_score[:, 1],
)
dsp.write_table(nongroup1_score, score_dataset + '_group5_prediction', 'output')

In [16]:
# Feature importances of a single voter (the one stored under key 1), using the
# columns of x_train.
# NOTE(review): the string argument is presumably the name the table is saved
# under -- confirm in make_feature_importance_table.
make_feature_importance_table(estimators[1], x_train, 'feature_importance_group5')

                       column_name  feature_importance
110                     CR_ASES_ID            0.160033
45    NBR_DYS_NOT_PY_8DYS_LAST_1MO            0.053835
7              NBR_DLQ_ST_1_29_P3M            0.053470
33    NBR_DYS_NOT_PY_3DYS_LAST_1MO            0.039536
11                     AR_RSK_SCOR            0.032777
35                 CST_WST_RSK_IND            0.029431
78              CC_STMT_RET_RSN_ID            0.026393
30                 AR_RSK_PERF_IND            0.021698
20                 AV_PYMT_RTO_P6M            0.019592
15   NBR_DYS_NOT_PY_30DYS_LAST_8MO            0.017747
31        PYMT_PREV_BAL_AV_3MO_PCT            0.015882
87                       MISC_CODE            0.015495
27              AV_LMT_USG_RTO_P6M            0.013737
22                    CST_WST_SCOR            0.013364
32                   LAST_PRCH_DYS            0.013216


In [17]:
# NOTE(review): this repeats the write of nongroup1_validate to
# <validate_dataset>_group5_prediction that an earlier cell already performs;
# the frame is not modified in between, so this cell looks redundant.
dsp.write_table(nongroup1_validate, validate_dataset + '_group5_prediction', 'output')

In [18]:
# NOTE(review): this repeats the write of nongroup1_score to
# <score_dataset>_group5_prediction that an earlier cell already performs;
# the frame is not modified in between, so this cell looks redundant.
dsp.write_table(nongroup1_score, score_dataset + '_group5_prediction', 'output')