# Client Data Only Model clientmodel3a

- feature selection on PenFed data +org data
- with LTV application feature
- train test split: 
  "train": {"start_date": "2021-07-01", "end_date": "2022-07-01"},
  "valid": {"start_date": "2022-07-01", "end_date": "2023-01-01"},
  "test": {"start_date": "2023-01-01", "end_date": "2024-01-01"}

In [1]:
# House keeping settings
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi=False

In [2]:
import os
import pandas as pd
import json
import numpy as np

from model_engine.assets.utils import load_asset
from model_engine.io.loaders import load_json
from model_engine.power.post_sale import PowerModelBuilder
from model_engine.analysis.suggesters import evaluate_datesplits
import warnings

from zaml.common.utils import load_state

import model_engine, zaml
model_engine.__version__, zaml.__version__




('v1.13.1', '34.4.1')

In [3]:
from model_engine.model_builder.asset_parser import asset_parser
from model_engine.model_builder import ModelBuilder

# Hyperparameter tuning

In [4]:
from zaml.model.modeling import XGBoostModel
from zaml.model.model_selection.parameter_search import ParameterSearch

In [5]:
base_model_id = 'clientmodel3a'
input_target_name = 'final_DQ90_m12'

base_model_output_path = f'/d/shared/silver_projects_v2/penfed/autoindirectv1/modeling/{base_model_id}'


train_app = pd.read_parquet(os.path.join(base_model_output_path, 'train_app.parquet')).sort_values('appDate')
valid_app = pd.read_parquet(os.path.join(base_model_output_path, 'valid_app.parquet')).sort_values('appDate')


train_fe_data = pd.read_parquet(os.path.join(base_model_output_path, 'train_fe_data.parquet')).merge(train_app[['ZEST_KEY','appDate']])
train_target = pd.read_parquet(os.path.join(base_model_output_path, 'train_target.parquet'))
has_target_list = list(train_target[train_target['target'].notna()]['ZEST_KEY'])
train = train_fe_data[train_fe_data['ZEST_KEY'].isin(has_target_list)].merge(train_target).sort_values('appDate').set_index('ZEST_KEY')
train_fe_data = train.drop(columns=['target'])
train_target = train['target']


valid_fe_data = pd.read_parquet(os.path.join(base_model_output_path, 'valid_fe_data.parquet')).merge(valid_app[['ZEST_KEY','appDate']])
valid_target = pd.read_parquet(os.path.join(base_model_output_path, 'valid_target.parquet'))
has_target_list = list(valid_target[valid_target['target'].notna()]['ZEST_KEY'])
valid = valid_fe_data[valid_fe_data['ZEST_KEY'].isin(has_target_list)].merge(valid_target).sort_values('appDate').set_index('ZEST_KEY')
valid_fe_data = valid.drop(columns=['target'])
valid_target = valid['target']

In [6]:
train_app['appDate'].min(), train_app['appDate'].max()

(Timestamp('2020-07-01 00:00:00'), Timestamp('2022-06-30 00:00:00'))

In [7]:
df = train_app 
df['appDate'] = pd.to_datetime(df['appDate'])

monthly_counts = (
    df
    .groupby(df['appDate'].dt.to_period('M'))
    .size()
    .reset_index(name='count')
)
monthly_counts

Unnamed: 0,appDate,count
0,2020-07,8882
1,2020-08,8347
2,2020-09,8304
3,2020-10,10292
4,2020-11,9027
5,2020-12,8594
6,2021-01,17786
7,2021-02,18652
8,2021-03,23850
9,2021-07,49804


In [8]:
# Determine the folds and check the counts in each fold 

folds = []
fold_size = len(df) // 4

for i in range(4):
    start_idx = i * fold_size
    if i < 3:
        end_idx = (i + 1) * fold_size
    else:
        end_idx = len(df)  
    print(df.iloc[0:end_idx].shape)
    print(df.iloc[0:end_idx].appDate.min(), df.iloc[0:end_idx].appDate.max())
    folds.append(df.iloc[0:end_idx])
    

(229856, 19)
2020-07-01 00:00:00 2021-09-08 00:00:00
(459712, 19)
2020-07-01 00:00:00 2022-01-11 00:00:00
(689568, 19)
2020-07-01 00:00:00 2022-04-11 00:00:00
(919427, 19)
2020-07-01 00:00:00 2022-06-30 00:00:00


In [9]:
valid_app['appDate'].min(), valid_app['appDate'].max()

(Timestamp('2022-07-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

In [15]:
X_combined = pd.concat([train_fe_data, valid_fe_data], axis=0)
y_combined = pd.concat([train_target, valid_target], axis=0)

X_combined = X_combined.reset_index(drop=True)
y_combined = y_combined.reset_index(drop=True)


# Define date ranges for splits
date_splits = [
    ("2020-07-01", "2021-11-01", "2021-11-01", "2022-02-01"),
    ("2020-07-01", "2022-02-01", "2022-02-01", "2022-04-15"),
    ("2020-07-01", "2022-04-15", "2021-04-15", "2022-07-01"),
    ("2020-07-01", "2022-07-01", "2022-07-01", "2022-12-31")
]

custom_splits = []

for train_start, train_end, val_start, val_end in date_splits:
    # Create boolean masks for train and validation splits
    train_mask = (X_combined['appDate'] >= train_start) & (X_combined['appDate'] < train_end)
    val_mask = (X_combined['appDate'] >= val_start) & (X_combined['appDate'] < val_end)
    
    # Get indices
    train_indices = np.where(train_mask)[0]
    val_indices = np.where(val_mask)[0]
    
    custom_splits.append((train_indices, val_indices))
    

X_combined = X_combined.drop(columns=['appDate'])


In [11]:
params_space = {
    "max_depth": [3,4,5,6],
    "subsample": [0.1, 0.3, 0.5, 0.7],
    "scale_pos_weight": [2.5, 3, 4],
    "colsample_bytree": [0.05, 0.1, 0.2, 0.25],
    "min_child_weight": [50, 100, 150, 200, 250, 300, 350],
}
fit_params = {
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'early_stopping_rounds': 200,      # Early stopping parameter
    'eval_metric': 'auc',
    'seed': 12
}

In [13]:
columns = ['subsample','scale_pos_weight','min_child_weight','max_depth','colsample_bytree'] + ['avg_train_score','std_train_score','avg_val_score','std_val_score']

# File path for the CSV
output_file = f"./tuning_submodel_weights_psk/{base_model_id}_cv_results.csv"

# Check if the file exists
file_exists = os.path.isfile(output_file)

# Open the file for appending, writing the header only if it doesn't exist
if not file_exists:
    with open(output_file, mode='w') as f:
        pd.DataFrame(columns=columns).to_csv(f, index=False)

default_params = {'subsample': 0.5,
  'scale_pos_weight': 2.5,
  'min_child_weight': 350,
  'max_depth': 3,
  'colsample_bytree': 0.05}

# found_flg = False
# for params in random_params:
#     if params==default_params:
#         found_flg = True
#         break
# print(found_flg)
# if ~found_flg:
#     random_params.append(default_params)

NameError: name 'random_params' is not defined

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterSampler

n_iter = 200  # Number of random samples
random_params = list(ParameterSampler(params_space, n_iter=n_iter, random_state=42))

# Loop over random parameter combinations
for params in random_params:
    print(f"Testing parameters: {params}")
    train_scores = []
    val_scores = []
    
    # Temporal cross-validation
    for train_idx, val_idx in custom_splits:
        X_train, X_val = X_combined.iloc[train_idx], X_combined.iloc[val_idx]
        y_train, y_val = y_combined.iloc[train_idx], y_combined.iloc[val_idx]
        
        combined_params = {**fit_params, **params}
        
        # Train the XGBoost model with early stopping
        model = XGBoostModel(**combined_params)
        model.fit(
            X_train, y_train,
            valid_data=(X_val, y_val)
        )
        
        # Evaluate the model on validation data
        val_preds = model.predict(X_val)
        val_score = roc_auc_score(y_val, val_preds)
        val_scores.append(val_score)
        
        # Evaluate the model on training data
        train_preds = model.predict(X_train)
        train_score = roc_auc_score(y_train, train_preds)
        train_scores.append(train_score)
    
    # Calculate average and standard deviation of scores
    avg_train_score = np.mean(train_scores)
    std_train_score = np.std(train_scores)
    avg_val_score = np.mean(val_scores)
    std_val_score = np.std(val_scores)
    
    print(f"Train AUC: {avg_train_score}, Validation AUC: {avg_val_score}")
    
    row_data = {
        **params,
        'avg_train_score': avg_train_score,
        'std_train_score': std_train_score,
        'avg_val_score': avg_val_score,
        'std_val_score': std_val_score
    }
    row_df = pd.DataFrame([row_data])

    # Append the row to the CSV file
    row_df.to_csv(output_file, mode='a', index=False, header=False)

    print(f"Saved results for parameters: {params}")

Testing parameters: {'subsample': 0.5, 'scale_pos_weight': 4, 'min_child_weight': 150, 'max_depth': 6, 'colsample_bytree': 0.05}
Train AUC: 0.8906258323347278, Validation AUC: 0.8066667617405093
Saved results for parameters: {'subsample': 0.5, 'scale_pos_weight': 4, 'min_child_weight': 150, 'max_depth': 6, 'colsample_bytree': 0.05}
Testing parameters: {'subsample': 0.3, 'scale_pos_weight': 4, 'min_child_weight': 50, 'max_depth': 6, 'colsample_bytree': 0.05}
Train AUC: 0.8948327079374137, Validation AUC: 0.8089910708049153
Saved results for parameters: {'subsample': 0.3, 'scale_pos_weight': 4, 'min_child_weight': 50, 'max_depth': 6, 'colsample_bytree': 0.05}
Testing parameters: {'subsample': 0.7, 'scale_pos_weight': 2.5, 'min_child_weight': 200, 'max_depth': 6, 'colsample_bytree': 0.2}
Train AUC: 0.8811527437142684, Validation AUC: 0.8069365569643674
Saved results for parameters: {'subsample': 0.7, 'scale_pos_weight': 2.5, 'min_child_weight': 200, 'max_depth': 6, 'colsample_bytree': 0.2

In [None]:
import gc
gc.collect()

In [None]:

results = pd.read_csv(f'./tuning_submodel_weights_psk/{base_model_id}_cv_results.csv')


In [None]:
# Default hyperparameter set from LargeClientPipelineFactory

results.loc[(results['max_depth']==3)&(results['subsample']==0.5)&(results['scale_pos_weight']==2.5)&(results['colsample_bytree']==0.05)]

In [None]:
# Set of hyperparameters that give smallest difference between train and valid score 

results['avg_val_score_truncate'] = round(results['avg_val_score'], 3) 
results.sort_values(['avg_val_score_truncate','avg_train_score'], ascending=[False,True]).iloc[:60]

In [None]:
best_params = {
    "max_depth": ,
    "subsample": ,
    "scale_pos_weight": ,
    "colsample_bytree": ,
    "min_child_weight": ,
}

# Unfold Model 

In [None]:
# Update params in asset with the best_params from CV 

base_asset['config']['pipeline_factory']['model'] = {'zaml_class': 'XGBoostModel',
 'params': {'n_estimators': 10000,
  'learning_rate': 0.01,
    "max_depth": 6,
    "subsample": 0.5,
    "scale_pos_weight": 2.5,
    "colsample_bytree": 0.2,
    "min_child_weight": 100,
  'seed': 12,
  'early_stopping_rounds': 200,
  'eval_metric': 'auc'}}

# Fold Valid Model