# National Model 3a

- Equifax, TU, Experian National model 
- Features selection on national data
- Updated time split on Equifax
    - Training: 2016-04-01 to 2020-12-31
    - Valid: 2021-01-01 to 2021-03-31

The notebook covers all three steps for creating a new national model: feature selection, validation model, and fold-in model. Depending on the use case, not every step may be necessary (e.g., if the feature selection does not change compared to the previous iteration, step 1 may be skipped).

In [1]:
import os

import numpy as np
import pandas as pd

import model_engine
model_engine.__version__

'v1.13.0'

In [2]:
from model_engine.power.post_sale import NationalModelBuilder
from model_engine.io.loaders import load_json

import json


Pending Deprecation in Numba 0.57.0. For more information please see: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-the-numba-pycc-module
  from numba.pycc import CC
  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices

In [3]:
import numba

# Thread budget for numba-parallel code. 40 is the value used for this run;
# numba.set_num_threads raises ValueError when asked for more threads than
# NUMBA_NUM_THREADS (the maximum numba was started with), so the request is
# clamped to that limit to keep the cell runnable on smaller hosts.
n_jobs = min(40, numba.config.NUMBA_NUM_THREADS)
print(n_jobs)
numba.set_num_threads(n_jobs)

40


In [4]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to local packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [5]:
# Client, project and bureau that this notebook run is scoped to.
input_client_name = "penfed"
input_project_name = "autoloan"
input_bureau_name = "normalized"

# Options listed by the original author: 'standard_model', 'mega_model',
# 'standard_model_member_data', 'standard_model_ln_data'.
# '' or None reportedly falls back to 'standard_model' — TODO confirm.
model_type = "standard_model_member_data"

print(*(input_client_name, input_project_name, input_bureau_name, model_type))


penfed autoloan normalized standard_model_member_data


## Step 1: Create a feature selection model

In [6]:
# Identifier of the client's feature-selection model.

feature_selection_model_id = "nationalmodel3a"

In [7]:
# Output directory for the feature-selection model: shared base directory plus the model id.
feature_selection_model_output_path = (
    '/d/shared/users/lyt/penfed_custom_model_national_models/{}'.format(
        feature_selection_model_id
    )
)
feature_selection_model_output_path

'/d/shared/users/lyt/penfed_custom_model_national_models/nationalmodel3a'

In [8]:
# Configuration for the feature-selection run that is handed to NationalModelBuilder.
config = {
    "model_type": "feature_selection",
    "storage_location": "s3",
    # Which look-alike data to pull and how much of it to sample.
    "lookalike": {
        "bureau": "normalized",
        "accounts": ["auto"],
        "splits": ["train", "valid", "test"],
        "pull_names": None,
        "states": None,
        "max_row": 1400000,
        "tables": ["trade", "inq", "bnkr", "collec", "target", "app", "member"],
    },
    "model_config": {
        # NOTE(review): the notebook header lists training as ending 2020-12-31,
        # so end_date is presumably exclusive — confirm against model_engine.
        "data_split": {
            "train": {"start_date": "2016-04-01", "end_date": "2021-01-01"},
            "valid": {"start_date": "2021-01-01", "end_date": "2021-04-01"},
            "test": {"start_date": "2021-04-01", "end_date": "2022-01-01"},
        },
        "target": "final_DQ60_m24",
        # "base_features" is intentionally left unset (it was commented out by the author).
        "feature_filter_rules": "/home/lyt/CLIENTS/client-project-penfed/modeling/national_models/normalized_intuitive_features.json",
        "memory_efficient": True,
        "pipeline_factory": None,
        "bivariate_fe_instructions": [],
        "monotonic_constraints_list": [],
        # Features excluded from selection; every entry carries the reason "invalid".
        "exclusion_list": [
            {"feature": feature_name, "reason": "invalid"}
            for feature_name in (
                "inq_inquiries__count_by_month__sum",
                "inq_inquiries__count_by_month__var",
            )
        ],
        "feature_definition_list": [],
        "key_factor_mapping_list": [],
        "fold_valid": False,
        "mandatory_features": [
            # --- trade ---
            "trade_months_since_openDate__max",
            "trade_acctType__count",
            "trade_number_DQ30_24_months__sum",
            "trade_number_DQ60_24_months__sum",
            "trade_number_DQ90_24_months__sum",
            "trade_blncAmt_all_derog_accts__max",
            "trade_blncAmt_active_accts__sum",
            "trade_blncAmt_active_accts__sum_by_prtfType_revolving",
            # --- inquiries ---
            "inq_inquiries__count_by_indCode_auto_mortgage",
            "inq_inquiries__count_by_month__max",
            "inq_inquiries__count_by_month__mean",
            "inq_inquiries__sum",
            "inq_inquiries_in_last_3_months__sum",
            "inq_inquiries_in_last_6_months__sum",
            "inq_inquiries_in_last_9_months__sum",
            "inq_inquiries_in_last_month__sum",
            "inq_inquiries_in_last_year__sum",
            "inq_months_since_INQUIRY_DATE__max",
            "inq_months_since_INQUIRY_DATE__mean",
            "inq_months_since_INQUIRY_DATE__min",
            "inq_months_since_INQUIRY_DATE__sum",
            # --- collections ---
            "collec_blnc__sum",
            "collec_months_since_collection_opened__min",
            "collec___count",
            "collec_ratio_paid__max",
            "collec_ratio_paid__min",
            # --- bankruptcies ---
            "bnkr___count",
            "bnkr___count_by_type_chapter_13_discharged",
            "bnkr___count_by_type_chapter_13_dismissed",
            "bnkr___count_by_type_chapter_7_discharged",
            "bnkr___count_by_type_chapter_7_dismissed",
            "bnkr_months_since_date_filed__max",
            "bnkr_months_since_date_filed__mean",
            "bnkr_months_since_date_filed__min",
            # --- member ---
            "member_hcAmt_accts_never_dq__mean",
            "member_hcAmt__max_by_prtfType_revolving",
            "member_hcAmt__max_by_prtfType_installment",
            "member_hcAmt__sum_by_prtfType_installment",
            "member_hcAmt_accts_never_dq__max_by_prtfType_installment",
            "member_hcAmt_accts_never_dq__max_by_prtfType_revolving",
            "member_hcAmt_accts_opened_12m__sum_by_prtfType_installment",
            "member_hcAmt_accts_opened_12m__sum_by_prtfType_revolving",
            "member_hcAmt_active_accts__sum_by_prtfType_installment",
            "member_hcAmt_active_accts__sum_by_prtfType_revolving",
            "member_months_since_openDate__max_by_ecoa_individual",
            "member_hcAmt_active_accts__mean_by_ecoa_individual",
            "member_blnc_to_hc__max_by_prtfType_revolving",
            "member_blnc_to_hc__max_by_prtfType_installment",
            "member_blnc_to_hc_accts_never_dq__max_by_prtfType_revolving",
            "member_blnc_to_hc_accts_never_dq__max_by_prtfType_installment",
            "member_months_since_openDate__min",
            "member_months_since_openDate__max",
            "member_months_since_openDate__mean",
            "member_number_DQ60_24_months__sum",
            "member_number_DQ30_24_months__sum",
            "member_number_DQ60_12_months__sum",
            "member_number_DQ30_12_months__sum",
            "member_number_CO_24_months__sum",
            "member_acctType__count",
            "member_blncAmt__sum_by_acctType_unsecure",
            "member_blncAmt__sum_by_prtfType_revolving",
            "member_blncAmt__sum_by_prtfType_installment",
        ],
    },
}

In [9]:
# Build and run the feature-selection model from `config`.
# The captured log for this run shows the pipeline fit alone taking about 2h42m.
# NOTE(review): overwrite_model_output=True presumably replaces any existing
# artifacts under the output path — confirm before re-running.
nmb = NationalModelBuilder(
    model_output_path=feature_selection_model_output_path,
    configuration=config,
    overwrite_model_output=True,
)
nmb.run()

Attempting to create a national model here /d/shared/users/lyt/penfed_custom_model_national_models/nationalmodel3a
Sampling experian, national_1.3
Sampling experian, national_2
Sampling equifax, national_1.1
Sampling equifax, national_2
Sampling equifax, national_1_home_equity
Sampling equifax, national_3
Sampling transunion, national_3_refresh
Sampling transunion, national_4
Sampling experian, national_1.3
Sampling experian, national_2
Sampling equifax, national_1.1
Sampling equifax, national_2
Sampling equifax, national_1_home_equity
Sampling equifax, national_3
Sampling transunion, national_3_refresh
Sampling transunion, national_4
Sampling experian, national_1.3
Sampling experian, national_2
Sampling equifax, national_1.1
Sampling equifax, national_2
Sampling equifax, national_1_home_equity
Sampling equifax, national_3
Sampling transunion, national_3_refresh
Sampling transunion, national_4



severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound o

Max number of rows allowed is: 1400000
Total number of keys: 44043889
Auto sample fraction: 0.03178647553125929
parsing model-builder asset




severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound 

Configuring model builder


INFO:zaml.artifact_engine.logger:Executing InputArtifact <input_asset>...
INFO:zaml.artifact_engine.logger:Executing InputArtifact <input_data>...
INFO:zaml.artifact_engine.logger:Executing InputArtifact <monotonic_constraints_list>...
INFO:zaml.artifact_engine.logger:Executing InputArtifact <data_split>...
INFO:zaml.artifact_engine.logger:Executing InputArtifact <train_sample_weight>...
INFO:zaml.artifact_engine.logger:Not all required inputs are available for optional artifact <train_sample_weight>, thus it will be omitted.
INFO:zaml.artifact_engine.logger:Executing InputArtifact <valid_sample_weight>...
INFO:zaml.artifact_engine.logger:Not all required inputs are available for optional artifact <valid_sample_weight>, thus it will be omitted.
INFO:zaml.artifact_engine.logger:Executing InputArtifact <test_sample_weight>...
INFO:zaml.artifact_engine.logger:Not all required inputs are available for optional artifact <test_sample_weight>, thus it will be omitted.
INFO:zaml.artifact_engin

building model


INFO:zaml.artifact_engine.logger:Finished <versions>, total time spent: 0:00:17.602571
INFO:zaml.artifact_engine.logger:Executing MonotonicConstraintsListParser <parsed_monotonic_constraints_list>...
INFO:zaml.artifact_engine.logger:Finished <parsed_monotonic_constraints_list>, total time spent: 0:00:00.258825
INFO:zaml.artifact_engine.logger:Executing SplitterArtifact <splitter>...
INFO:zaml.artifact_engine.logger:Finished <splitter>, total time spent: 0:00:00.000668
INFO:zaml.artifact_engine.logger:Executing DataArtifact <data>...
INFO:zaml.artifact_engine.logger:Finished <data>, total time spent: 0:00:00.000402
INFO:zaml.artifact_engine.logger:Executing ExclusionListParser <parsed_exclusion_list>...
INFO:zaml.artifact_engine.logger:Finished <parsed_exclusion_list>, total time spent: 0:00:00.001587
INFO:zaml.artifact_engine.logger:Executing BivariateFeParser <parsed_biv_fe_instructions>...
INFO:zaml.artifact_engine.logger:Finished <parsed_biv_fe_instructions>, total time spent: 0:00:

-------------------------
Name: app
Transformer type: None
Number of features: 18
Time spent: 0.059s
-------------------------
Name: bnkr
Transformer type: None
Number of features: 8
Time spent: 0.019s
-------------------------
Name: collec
Transformer type: None
Number of features: 5
Time spent: 0.016s
-------------------------
Name: inq
Transformer type: None
Number of features: 17
Time spent: 0.017s
-------------------------
Name: member
Transformer type: None
Number of features: 8464
Time spent: 0.063s
-------------------------
Name: trade
Transformer type: None
Number of features: 8464
Time spent: 0.066s
-------------------------
Name: app FE
Transformer type: OneToOneEngine
Number of features: 0
Time spent: 0.079s
-------------------------
Name: bnkr FE
Transformer type: ManyToOneEngine
Number of features: 8
Time spent: 0.050s
-------------------------
Name: collec FE
Transformer type: ManyToOneEngine
Number of features: 5
Time spent: 0.050s
-------------------------
Name: inq FE





-------------------------
Name: GainFeatureSelection
Transformer type: GainFeatureSelection
Number of features: 290
Time spent: 8165.936s
-------------------------
Name: Drop Engineered Features
Transformer type: GeneralFeatureSelection
Number of features: 290
Time spent: 0.003s


INFO:zaml.artifact_engine.logger:Finished <pipeline_fitter>, total time spent: 2:42:07.171576
INFO:zaml.artifact_engine.logger:Executing FittedPipeline <pipeline>...
INFO:zaml.artifact_engine.logger:Finished <pipeline>, total time spent: 0:00:00.000727
INFO:zaml.artifact_engine.logger:Executing FitTimeInfoArtifact <fit_time_info>...
INFO:zaml.artifact_engine.logger:Finished <fit_time_info>, total time spent: 0:00:00.000597
INFO:zaml.artifact_engine.logger:Executing PipeFactoryArtifact <pipe_factory>...
INFO:zaml.artifact_engine.logger:Finished <pipe_factory>, total time spent: 0:00:00.000590
INFO:zaml.artifact_engine.logger:Executing FittedModel <model>...
INFO:zaml.artifact_engine.logger:Finished <model>, total time spent: 0:00:00.000425
INFO:zaml.artifact_engine.logger:Executing FeDataArtifact <train_fe_data>...


-------------------------
Name: app
Transformer type: None
Number of features: 18
Time spent: 0.000s
-------------------------
Name: bnkr
Transformer type: None
Number of features: 8
Time spent: 0.000s
-------------------------
Name: collec
Transformer type: None
Number of features: 5
Time spent: 0.000s
-------------------------
Name: inq
Transformer type: None
Number of features: 17
Time spent: 0.000s
-------------------------
Name: member
Transformer type: None
Number of features: 8464
Time spent: 0.005s
-------------------------
Name: trade
Transformer type: None
Number of features: 8464
Time spent: 0.001s
-------------------------
Name: app FE
Transformer type: OneToOneEngine
Number of features: 0
Time spent: 0.049s
-------------------------
Name: bnkr FE
Transformer type: ManyToOneEngine
Number of features: 8
Time spent: 0.018s
-------------------------
Name: collec FE
Transformer type: ManyToOneEngine
Number of features: 5
Time spent: 0.018s
-------------------------
Name: inq FE


INFO:zaml.artifact_engine.logger:Finished <train_fe_data>, total time spent: 0:03:46.885572
INFO:zaml.artifact_engine.logger:Executing FeDataArtifact <valid_fe_data>...


-------------------------
Name: GainFeatureSelection
Transformer type: GainFeatureSelection
Number of features: 290
Time spent: 0.207s
-------------------------
Name: Drop Engineered Features
Transformer type: GeneralFeatureSelection
Number of features: 290
Time spent: 0.001s
-------------------------
Name: app
Transformer type: None
Number of features: 18
Time spent: 0.000s
-------------------------
Name: bnkr
Transformer type: None
Number of features: 8
Time spent: 0.000s
-------------------------
Name: collec
Transformer type: None
Number of features: 5
Time spent: 0.000s
-------------------------
Name: inq
Transformer type: None
Number of features: 17
Time spent: 0.000s
-------------------------
Name: member
Transformer type: None
Number of features: 8464
Time spent: 0.001s
-------------------------
Name: trade
Transformer type: None
Number of features: 8464
Time spent: 0.001s
-------------------------
Name: app FE
Transformer type: OneToOneEngine
Number of features: 0
Time spent: 


INFO:zaml.artifact_engine.logger:Finished <valid_fe_data>, total time spent: 0:00:47.524065
INFO:zaml.artifact_engine.logger:Executing FeDataArtifact <test_fe_data>...


-------------------------
Name: FillNA
Transformer type: FillNA
Number of features: 5230
Time spent: 44.344s
-------------------------
Name: CorrelationFeatureSelection
Transformer type: CorrelationFeatureSelection
Number of features: 3338
Time spent: 0.081s
-------------------------
Name: GainFeatureSelection
Transformer type: GainFeatureSelection
Number of features: 290
Time spent: 0.026s
-------------------------
Name: Drop Engineered Features
Transformer type: GeneralFeatureSelection
Number of features: 290
Time spent: 0.001s
-------------------------
Name: app
Transformer type: None
Number of features: 18
Time spent: 0.000s
-------------------------
Name: bnkr
Transformer type: None
Number of features: 8
Time spent: 0.000s
-------------------------
Name: collec
Transformer type: None
Number of features: 5
Time spent: 0.000s
-------------------------
Name: inq
Transformer type: None
Number of features: 17
Time spent: 0.000s
-------------------------
Name: member
Transformer type: N


INFO:zaml.artifact_engine.logger:Finished <test_fe_data>, total time spent: 0:00:50.005628
INFO:zaml.artifact_engine.logger:Executing StaticAssetArtifact <static_asset>...


-------------------------
Name: FillNA
Transformer type: FillNA
Number of features: 5230
Time spent: 45.135s
-------------------------
Name: CorrelationFeatureSelection
Transformer type: CorrelationFeatureSelection
Number of features: 3338
Time spent: 0.129s
-------------------------
Name: GainFeatureSelection
Transformer type: GainFeatureSelection
Number of features: 290
Time spent: 0.030s
-------------------------
Name: Drop Engineered Features
Transformer type: GeneralFeatureSelection
Number of features: 290
Time spent: 0.001s


INFO:zaml.artifact_engine.logger:Finished <static_asset>, total time spent: 0:00:03.561945
INFO:zaml.artifact_engine.logger:Executing TrainHistoryArtifact <train_history>...
INFO:zaml.artifact_engine.logger:Finished <train_history>, total time spent: 0:00:00.000616
INFO:zaml.artifact_engine.logger:Executing BestModelParamsArtifact <best_model_params>...
INFO:zaml.artifact_engine.logger:Finished <best_model_params>, total time spent: 0:00:00.000337
INFO:zaml.artifact_engine.logger:Executing ScoresArtifact <train_scores>...
INFO:zaml.artifact_engine.logger:Finished <train_scores>, total time spent: 0:00:01.297962
INFO:zaml.artifact_engine.logger:Executing SubmodelScoresArtifact <train_submodel_scores>...
INFO:zaml.artifact_engine.logger:Finished <train_submodel_scores>, total time spent: 0:00:00.000427
INFO:zaml.artifact_engine.logger:Executing ScoresArtifact <valid_scores>...
INFO:zaml.artifact_engine.logger:Finished <valid_scores>, total time spent: 0:00:00.062945
INFO:zaml.artifact_en

In [11]:
# Read the feature-importance table from the model output directory
# and display its (rows, columns) shape.
ftr_importance = pd.read_parquet(
    os.path.join(
        feature_selection_model_output_path,
        'feature_importance.parquet',
    )
)
ftr_importance.shape

(290, 2)

In [12]:
# Names of the score files to read: AUC first, then KS, each for train and test.
scoreFiles = [
    split + suffix
    for suffix in ['_auc.json', '_ks.json']
    for split in ['train', 'test']
]

# Map "<split>_<metric>" -> parsed JSON content of the corresponding score file.
scoreFiles_model = {}
for file_name in scoreFiles:
    metric_name = file_name.split('.')[0]
    score_path = os.path.join(feature_selection_model_output_path, file_name)
    # Use a context manager so each file handle is closed right after parsing.
    with open(score_path, 'rb') as score_file:
        scoreFiles_model[metric_name] = json.load(score_file)

for metric_name, score in scoreFiles_model.items():
    print(metric_name, score)

train_auc {'auc': 0.867771515412307}
test_auc {'auc': 0.859232808486821}
train_ks {'ks': 0.5796978413022646}
test_ks {'ks': 0.5745629557004234}


In [11]:
# NOTE(review): this cell repeats the score-loading logic of the preceding cell and
# carries an older execution count; its recorded output differs, so it appears to
# come from a different run — consider deleting one of the two copies.

# Names of the score files to read: AUC first, then KS, each for train and test.
scoreFiles = [
    split + suffix
    for suffix in ['_auc.json', '_ks.json']
    for split in ['train', 'test']
]

# Map "<split>_<metric>" -> parsed JSON content of the corresponding score file.
scoreFiles_model = {}
for file_name in scoreFiles:
    metric_name = file_name.split('.')[0]
    score_path = os.path.join(feature_selection_model_output_path, file_name)
    # Use a context manager so each file handle is closed right after parsing.
    with open(score_path, 'rb') as score_file:
        scoreFiles_model[metric_name] = json.load(score_file)

for metric_name, score in scoreFiles_model.items():
    print(metric_name, score)

train_auc {'auc': 0.8686551218870335}
test_auc {'auc': 0.8535502746706296}
train_ks {'ks': 0.5812319389933264}
test_ks {'ks': 0.5624095655404407}


In [3]:
# Load the packaged "intuitive features" filter rules for inspection.
# The file is opened in a context manager so the handle is closed once parsed.
# NOTE(review): the run configuration points at a different rules file
# (normalized_intuitive_features.json) — confirm this is the one meant to be inspected.
with open("/home/lyt/CODEBASE/model-engine_penfed/model-engine/model_engine/assets/power/feature_rules/intuitive_features.json", "r") as rules_file:
    intui = json.load(rules_file)
intui

[{'name': 'drop_48m_features',
  'drop': {'operation': 'contains', 'logic': 'or', 'keywords': ['48']}},
 {'name': 'drop_sum_features',
  'among': {'operation': 'contains',
   'logic': 'or',
   'keywords': ['trade_crdUtl',
    'trade_months_since_openDate',
    'trade_hc_to_cl',
    'trade_pstDue_to_blnc',
    'trade_pstDue_to_hc',
    'trade_pstDue_to_cl',
    'trade_percent_DQ',
    'trade_percent_CO']},
  'drop': {'operation': 'contains', 'logic': 'or', 'keywords': ['_sum']}},
 {'name': 'keep_only_revolving',
  'among': {'operation': 'contains',
   'logic': 'or',
   'keywords': ['trade_crdUtl', 'trade_clAmt', 'trade_hc_to_cl']},
  'keep': {'operation': 'contains',
   'logic': 'or',
   'keywords': ['_by_prtfType_revolving',
    '_by_acctType_credit_card',
    '_by_acctType_charge_card']}},
 {'name': 'keep_only_by_accountportfolio',
  'among': {'operation': 'contains',
   'logic': 'or',
   'keywords': ['trade_pstDue_to_hc',
    'trade_blnc_to_hc',
    'trade_pstDue_to_cl']},
  'keep': 