In [25]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV


## set up pyspark session

In [26]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

In [27]:
def read_gold_table(table, gold_db, spark):
    """
    Helper function to read all partitions of a gold table
    """
    folder_path = os.path.join(gold_db, table)
    files_list = [os.path.join(folder_path, os.path.basename(f)) for f in glob.glob(os.path.join(folder_path, '*'))]
    df = spark.read.option("header", "true").parquet(*files_list)
    return df


In [28]:
X_spark = read_gold_table('feature_store', 'datamart/gold', spark)
y_spark = read_gold_table('label_store', 'datamart/gold', spark)

In [29]:
X_df = X_spark.toPandas().sort_values(by='Customer_ID')
y_df = y_spark.toPandas().sort_values(by='Customer_ID')

                                                                                

In [30]:
X_spark.show(5)
y_spark.show(5)

+-----------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+---+------------------------+-------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+------------------------+-------------------+-----------------------+---------------+-----------------------+------------------------+-------------+------------------------+------------------------+-------------------------+--------------------+------------------------+-------------------+-----------------+------------------+--------------------+-----------------+---------------

## set up config

In [31]:
# set up config
model_train_date_str = "2024-09-02"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] =  oot_period_months
config["model_train_date"] =  datetime.strptime(model_train_date_str, "%Y-%m-%d").date()
config["oot_end_date"] =  config['model_train_date'] - timedelta(days = 1)
config["oot_start_date"] =  config['model_train_date'] - relativedelta(months = oot_period_months)
config["train_test_end_date"] =  config["oot_start_date"] - timedelta(days = 1)
config["train_test_start_date"] =  config["oot_start_date"] - relativedelta(months = train_test_period_months)
config["train_test_ratio"] = train_test_ratio 


pprint.pprint(config)

{'model_train_date': datetime.date(2024, 9, 2),
 'model_train_date_str': '2024-09-02',
 'oot_end_date': datetime.date(2024, 9, 1),
 'oot_period_months': 2,
 'oot_start_date': datetime.date(2024, 7, 2),
 'train_test_end_date': datetime.date(2024, 7, 1),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.date(2023, 7, 2)}


In [None]:
# User parameters
model_train_date_str = "2024-09-02"
train_test_period_months = 12
oot_months = 2  # Number of OOT periods (each 1 month)
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_months"] = oot_months
config["model_train_date"] = datetime.strptime(model_train_date_str, "%Y-%m-%d").date()
config["train_test_ratio"] = train_test_ratio

# Calculate OOT periods (each 1 month)
oot_periods = []
for i in range(oot_months, 0, -1):
    oot_start = config["model_train_date"] - relativedelta(months=i)
    oot_end = (oot_start + relativedelta(months=1)) - timedelta(days=1)
    oot_periods.append({"oot_start": oot_start, "oot_end": oot_end})

config["oot_periods"] = oot_periods

# Train/Test period: before first OOT period, for specified months
config["train_test_end_date"] = oot_periods[0]["oot_start"] - timedelta(days=1)
config["train_test_start_date"] = oot_periods[0]["oot_start"] - relativedelta(months=train_test_period_months)

pprint.pprint(config)

{'model_train_date': datetime.date(2024, 9, 2),
 'model_train_date_str': '2024-09-02',
 'oot_months': 2,
 'oot_periods': [{'oot_end': datetime.date(2024, 8, 1),
                  'oot_start': datetime.date(2024, 7, 2)},
                 {'oot_end': datetime.date(2024, 9, 1),
                  'oot_start': datetime.date(2024, 8, 2)}],
 'train_test_end_date': datetime.date(2024, 7, 1),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.date(2023, 7, 2)}


## prepare data for modeling

In [33]:
# Filter y for the modeling window
y_model_df = y_df[
    (y_df['snapshot_date'] >= config['train_test_start_date']) &
    (y_df['snapshot_date'] <= config['model_train_date'])
]

# Filter X for Customer_IDs present in y_model_df
X_model_df = X_df[X_df['Customer_ID'].isin(y_model_df['Customer_ID'].unique())]

# Create OOT splits (for each OOT period)
oot_splits = []
for oot in config["oot_periods"]:
    y_oot = y_model_df[
        (y_model_df['snapshot_date'] >= oot['oot_start']) &
        (y_model_df['snapshot_date'] <= oot['oot_end'])
    ]
    X_oot = X_model_df[X_model_df['Customer_ID'].isin(y_oot['Customer_ID'].unique())]
    oot_splits.append((X_oot, y_oot))

# Everything else goes into train-test (before the first OOT period)
y_traintest = y_model_df[y_model_df['snapshot_date'] <= config['train_test_end_date']]
X_traintest = X_model_df[X_model_df['Customer_ID'].isin(y_traintest['Customer_ID'].unique())]

In [34]:
# After filtering y_traintest, get the Customer_IDs
traintest_ids = y_traintest['Customer_ID'].unique()
# Filter X_traintest to only those IDs
X_traintest = X_traintest[X_traintest['Customer_ID'].isin(traintest_ids)]

X_traintest = X_traintest.sort_values(['Customer_ID', 'snapshot_date']).reset_index(drop=True)
y_traintest = y_traintest.sort_values(['Customer_ID', 'snapshot_date']).reset_index(drop=True)

print(X_traintest.shape, y_traintest.shape)

(5913, 77) (5913, 5)


In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X_traintest,
    y_traintest['label'],  # Only the label column!
    test_size=config['train_test_ratio'],
    random_state=611,
    shuffle=True,
    stratify=y_traintest['label']
)

In [39]:
drop_cols = ['Customer_ID', 'clickstream_snapshot_date', 'attributes_snapshot_date', 'financial_snapshot_date', 'snapshot_date']

X_train_arr = X_train.drop(columns=drop_cols, errors='ignore').values
X_test_arr = X_test.drop(columns=drop_cols, errors='ignore').values

# For multiple OOT splits (assuming oot_splits is a list of (X_oot, y_oot))
X_oot_arrs = [X_oot.drop(columns=drop_cols, errors='ignore').values for X_oot, _ in oot_splits]
y_oot_arrs = [y_oot['label'].values if 'label' in y_oot.columns else y_oot.values for _, y_oot in oot_splits]

## Train data

## train model

In [40]:
# Drop non-numeric columns before scaling
non_feature_cols = [
    'Customer_ID',
    'clickstream_snapshot_date',
    'attributes_snapshot_date',
    'financial_snapshot_date',
    'snapshot_date'
]

X_train_num = X_train.drop(columns=non_feature_cols, errors='ignore')
X_test_num = X_test.drop(columns=non_feature_cols, errors='ignore')

# For multiple OOT splits
X_oot_nums = [X_oot.drop(columns=non_feature_cols, errors='ignore') for X_oot, _ in oot_splits]

# Fit scaler on numeric data
scaler = StandardScaler()
transformer_stdscaler = scaler.fit(X_train_num)

# Transform data
X_train_processed = transformer_stdscaler.transform(X_train_num)
X_test_processed = transformer_stdscaler.transform(X_test_num)
X_oot_processeds = [transformer_stdscaler.transform(X_oot_num) for X_oot_num in X_oot_nums]

print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
for idx, X_oot_processed in enumerate(X_oot_processeds, 1):
    print(f'X_oot_processed_{idx}', X_oot_processed.shape[0])

X_train_processed 1182
X_test_processed 4731
X_oot_processed_1 518
X_oot_processed_2 511


In [41]:
# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
param_dist = {
    'n_estimators': [25, 50],
    'max_depth': [2, 3],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2],
    'scale_pos_weight': [1, 2, 5, 10],        
    'max_delta_step': [0, 1, 5],              
    'base_score': [0.3, 0.5, 0.7], 
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    scoring=auc_scorer,
    n_iter=100,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the random search
random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", train_auc_score)
print("TRAIN GINI score: ", round(2*train_auc_score-1,3))

# Evaluate the model on the test set
y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc_score)
print("Test GINI score: ", round(2*test_auc_score-1,3))

# Evaluate the model on all OOT splits
oot_auc_scores = []
for idx, (X_oot_processed, y_oot) in enumerate(zip(X_oot_processeds, y_oot_arrs), 1):
    y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
    auc = roc_auc_score(y_oot, y_pred_proba)
    oot_auc_scores.append(auc)
    print(f"OOT{idx} AUC score: {auc}")
    print(f"OOT{idx} GINI score: {round(2*auc-1,3)}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'subsample': 0.6, 'scale_pos_weight': 2, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 5, 'max_depth': 2, 'max_delta_step': 1, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6, 'base_score': 0.5}
Best AUC score:  0.7393188817395008
Train AUC score:  0.7956336408760761
TRAIN GINI score:  0.591
Test AUC score:  0.7505652953939315
Test GINI score:  0.501
OOT1 AUC score: 0.7355281735335644
OOT1 GINI score: 0.471
OOT2 AUC score: 0.7398479800083297
OOT2 GINI score: 0.48


In [42]:
model_artefact = {}

model_artefact['model'] = best_model
model_artefact['model_version'] = "credit_model_" + config["model_train_date_str"].replace('-', '_')
model_artefact['preprocessing_transformers'] = {'stdscaler': scaler}
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {
    'X_train': X_train.shape[0],
    'X_test': X_test.shape[0],
    'y_train': round(y_train.mean(), 2),
    'y_test': round(y_test.mean(), 2),
    'X_oot': [X_oot.shape[0] for X_oot in X_oot_arrs],
    'y_oot': [round(y_oot.mean(), 2) for y_oot in y_oot_arrs]
}
model_artefact['results'] = {
    'auc_train': train_auc_score,
    'auc_test': test_auc_score,
    'auc_oot': oot_auc_scores,  # list of AUCs for each OOT
    'gini_train': round(2 * train_auc_score - 1, 3),
    'gini_test': round(2 * test_auc_score - 1, 3),
    'gini_oot': [round(2 * auc - 1, 3) for auc in oot_auc_scores]
}
model_artefact['hp_params'] = random_search.best_params_

pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.date(2024, 9, 2),
                'model_train_date_str': '2024-09-02',
                'oot_months': 2,
                'oot_periods': [{'oot_end': datetime.date(2024, 8, 1),
                                 'oot_start': datetime.date(2024, 7, 2)},
                                {'oot_end': datetime.date(2024, 9, 1),
                                 'oot_start': datetime.date(2024, 8, 2)}],
                'train_test_end_date': datetime.date(2024, 7, 1),
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.date(2023, 7, 2)},
 'data_stats': {'X_oot': [518, 511],
                'X_test': 4731,
                'X_train': 1182,
                'y_oot': [0.28, 0.33],
                'y_test': 0.29,
                'y_train': 0.29},
 'hp_params': {'base_score': 0.5,
               'colsample_bytree': 0.6,
               'gamma': 0,
               'learning_ra

In [None]:
# Define the XGBoost classifier
# xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# # Define the hyperparameter space to search
# param_dist = {
#     'n_estimators': [25, 50],
#     'max_depth': [2, 3],
#     'learning_rate': [0.01, 0.1],
#     'subsample': [0.6, 0.8],
#     'colsample_bytree': [0.6, 0.8],
#     'gamma': [0, 0.1],
#     'min_child_weight': [1, 3, 5],
#     'reg_alpha': [0, 0.1, 1],
#     'reg_lambda': [1, 1.5, 2],
#     'scale_pos_weight': [1, 2, 5, 10],        
#     'max_delta_step': [0, 1, 5],              
#     'base_score': [0.3, 0.5, 0.7], 
# }

# # Create a scorer based on AUC score
# auc_scorer = make_scorer(roc_auc_score)

# # Set up the random search with cross-validation
# random_search = RandomizedSearchCV(
#     estimator=xgb_clf,
#     param_distributions=param_dist,
#     scoring=auc_scorer,
#     n_iter=100,  # Number of iterations for random search
#     cv=3,       # Number of folds in cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1   # Use all available cores
# )

# # Perform the random search
# random_search.fit(X_train_processed, y_train)

# # Output the best parameters and best score
# print("Best parameters found: ", random_search.best_params_)
# print("Best AUC score: ", random_search.best_score_)

# # Evaluate the model on the train set
# best_model = random_search.best_estimator_
# y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
# train_auc_score = roc_auc_score(y_train, y_pred_proba)
# print("Train AUC score: ", train_auc_score)

# # Evaluate the model on the test set
# best_model = random_search.best_estimator_
# y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
# test_auc_score = roc_auc_score(y_test, y_pred_proba)
# print("Test AUC score: ", test_auc_score)

# # Evaluate the model on the oot set
# best_model = random_search.best_estimator_
# y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
# oot_auc_score = roc_auc_score(y_oot['label'], y_pred_proba) 
# print("OOT AUC score: ", oot_auc_score)

# print("TRAIN GINI score: ", round(2*train_auc_score-1,3))
# print("Test GINI score: ", round(2*test_auc_score-1,3))
# print("OOT GINI score: ", round(2*oot_auc_score-1,3))

In [48]:
# model_artefact = {}

# model_artefact['model'] = best_model
# model_artefact['model_version'] = "credit_model_" + config["model_train_date_str"].replace('-', '_')
# model_artefact['preprocessing_transformers'] = {}  
# model_artefact['preprocessing_transformers']['stdscaler'] = scaler
# model_artefact['data_dates'] = config
# model_artefact['data_stats'] = {}
# model_artefact['data_stats']['X_train'] = X_train.shape[0]
# model_artefact['data_stats']['X_test'] = X_test.shape[0]
# model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
# model_artefact['data_stats']['y_train'] = round(y_train.mean(), 2)
# model_artefact['data_stats']['y_test'] = round(y_test.mean(), 2)
# model_artefact['data_stats']['y_oot'] = round(y_oot['label'].mean(), 2)
# model_artefact['results'] = {}
# model_artefact['results']['auc_train'] = train_auc_score
# model_artefact['results']['auc_test'] = test_auc_score
# model_artefact['results']['auc_oot'] = oot_auc_score
# model_artefact['results']['accuracy_train'] = train_accuracy
# model_artefact['results']['accuracy_test'] = test_accuracy
# model_artefact['results']['accuracy_oot'] = oot_accuracy
# model_artefact['hp_params'] = random_search.best_params_

# pprint.pprint(model_artefact)

## prepare model artefact to save

## save artefact to model bank

In [43]:
# create model_bank dir
model_bank_directory = "model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [44]:
# Full path to the file
file_path = os.path.join(model_bank_directory, model_artefact['model_version'] + '.pkl')

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")


Model saved to model_bank/credit_model_2024_09_02.pkl


## test load pickle and make model inference

In [47]:
# Load the model from the pickle file
with open(file_path, 'rb') as file:
    loaded_model_artefact = pickle.load(file)

# Take the last OOT split
X_oot_last = X_oot_arrs[0]
y_oot_last = y_oot_arrs[0]

# Predict probabilities on the last OOT set
y_pred_proba = loaded_model_artefact['model'].predict_proba(X_oot_last)[:, 1]

# If y_oot_last is an array of true labels
oot_auc_score = roc_auc_score(y_oot_last, y_pred_proba)
print("Last OOT AUC score:", oot_auc_score)

Last OOT AUC score: 0.6513742963492675
