# Lapse Drivers Jupyter Module Testing Script

In [1]:
import os
import sys
from datetime import datetime

# The project root is taken to be the parent of the current working directory.
parent_directory = os.path.dirname(os.getcwd())
# Folder that holds the raw CSV extracts.
data_directory = os.path.join(parent_directory, "data", "raw")

# Make the project root, the utils package folder and the raw-data folder importable.
sys.path.extend([
    parent_directory,
    os.path.join(parent_directory, "resources", "utils"),
    data_directory,
])

In [2]:
from resources.utils.ingest import IngestCSV

## Test CSV loader

In [3]:
# Validity window applied to MONTH_KEY in both extracts.
month_key_window = (datetime(2000, 1, 1), datetime(2024, 8, 1))

# Customer survey extract: declare expected dtypes and allowed value ranges, then load.
survey_loader = IngestCSV(
    os.path.join(data_directory, "customer_survey.csv"),
    known_types={
        'POL_NUMBER': 'string',
        'MONTH_KEY': 'datetime64[ns]',
        'HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT': 'Int64',
        'GENERAL_FEEDBACK': 'string',
    },
    range_constraints={
        'MONTH_KEY': month_key_window,
        'HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT': (1, 5),
    },
)
survey_data = survey_loader.load_csv()

# Lapse extract: same MONTH_KEY window, plus a bound on DURATION.
lapse_loader = IngestCSV(
    os.path.join(data_directory, "lapse.csv"),
    known_types={
        'POL_NUMBER': 'string',
        'MONTH_KEY': 'datetime64[ns]',
        'AGE': 'Int64',
        'DURATION': 'Int64',
        'GENDER': 'string',
        'LAPSE_IN_12M': 'bool',
    },
    range_constraints={
        'MONTH_KEY': month_key_window,
        'DURATION': (1, 563),
    },
)
lapse_data = lapse_loader.load_csv()

In [4]:
# Preview the first rows of the loaded survey data.
survey_data.head()

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,GENERAL_FEEDBACK
0,1,2000-01-01,1.0,I hated the product
1,2,2014-08-01,2.0,I hated the product
2,3,2014-08-01,4.0,
3,4,2014-08-01,2.0,
4,5,2014-08-01,5.0,Middle of the range product


In [5]:
# Preview the first rows of the loaded lapse data.
lapse_data.head()

Unnamed: 0,POL_NUMBER,MONTH_KEY,AGE,DURATION,GENDER,LAPSE_IN_12M,RANDOMFIELD1,RANDOMFIELD2,RANDOMFIELD3,RANDOMFIELD4,RANDOMFIELD5,RANDOMFIELD6,RANDOMFIELD7,RANDOMFIELD8,RANDOMFIELD9,RANDOMFIELD10
0,1,2000-01-01,40,20.0,M,True,100.0,Apple,,,,,,,,
1,2,2014-08-01,36,,F,False,55.0,Pear,,,,,,,,
2,3,2014-08-01,20,12.0,M,False,30.0,Beat,,,,,,,,
3,4,2014-08-01,37,18.0,M,True,42.0,Apple,,,,,,,,
4,5,2014-08-01,66,563.0,M,True,,,,,,,,,,


In [6]:
# Show the column dtypes of both loaded frames, separated by a ruler line.
print(
    survey_data.dtypes,
    "=============================================================",
    lapse_data.dtypes,
    sep="\n",
)

POL_NUMBER                                     string[python]
MONTH_KEY                                      datetime64[ns]
HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT           float64
GENERAL_FEEDBACK                               string[python]
dtype: object
POL_NUMBER       string[python]
MONTH_KEY        datetime64[ns]
AGE                       Int64
DURATION                float64
GENDER           string[python]
LAPSE_IN_12M               bool
RANDOMFIELD1            float64
RANDOMFIELD2             object
RANDOMFIELD3            float64
RANDOMFIELD4            float64
RANDOMFIELD5            float64
RANDOMFIELD6            float64
RANDOMFIELD7            float64
RANDOMFIELD8            float64
RANDOMFIELD9            float64
RANDOMFIELD10           float64
dtype: object


## Test data preprocessing functions

In [9]:
from resources.utils.transform import DualSourceTransform

In [10]:
# Column roles used by the transformation, modelling and scoring cells below.
target = 'LAPSE_IN_12M'             # target column
segmentor = 'DURATION'              # field passed to impute() as the segmentation field
raw_text = 'GENERAL_FEEDBACK'       # raw string field(s); can be a list
keys = ['POL_NUMBER', 'MONTH_KEY']  # unique keys of the dataframes

In [11]:
# Initialise the data transformation class from the two loaded frames, the key columns,
# the target column and the raw text field.
# The class also accepts the type of merge operation to perform: 'inner', 'outer', 'left'
# or 'right' (defaults to 'inner').
lapseTransform = DualSourceTransform(survey_data,lapse_data,keys,target,raw_text) 

In [12]:
# View the combined base right after initialisation (first 10 rows).
base = lapseTransform.get_transformed_df()
base.head(10)

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,GENERAL_FEEDBACK,AGE,DURATION,GENDER,LAPSE_IN_12M,RANDOMFIELD1,RANDOMFIELD2,RANDOMFIELD3,RANDOMFIELD4,RANDOMFIELD5,RANDOMFIELD6,RANDOMFIELD7,RANDOMFIELD8,RANDOMFIELD9,RANDOMFIELD10
0,1,2000-01-01,1.0,I hated the product,40.0,20.0,M,True,100.0,Apple,,,,,,,,
1,2,2014-08-01,2.0,I hated the product,36.0,,F,False,55.0,Pear,,,,,,,,
2,3,2014-08-01,4.0,,20.0,12.0,M,False,30.0,Beat,,,,,,,,
3,4,2014-08-01,2.0,,37.0,18.0,M,True,42.0,Apple,,,,,,,,
4,5,2014-08-01,5.0,Middle of the range product,66.0,563.0,M,True,,,,,,,,,,
5,6,2014-08-01,,,50.0,100.0,F,False,,Beat,,,,,,,,
6,7,2014-08-01,3.0,Middle of the range product,47.0,38.0,F,False,55.0,Apple,,,,,,,,
7,8,2014-08-01,4.0,,,26.0,F,False,30.0,,,,,,,,,
8,9,2014-08-01,5.0,I liked the product,24.0,563.0,M,True,42.0,Beat,,,,,,,,
9,10,2014-08-01,1.0,I liked the product,55.0,42.0,F,False,255.0,Pear,,,,,,,,


In [13]:
# Drop columns and rows (apart from the set dataframe parameters) whose share of NULLs
# exceeds the threshold.
# In the saved outputs, RANDOMFIELD3..RANDOMFIELD10 and some rows (e.g. index 5 and 7)
# are no longer present after this step.
lapseTransform.drop_sparse_data()   # optionally pass a float to set the threshold, e.g. 0.75

In [14]:
# Generate features from the raw text field(s).
# The saved output of the next view cell shows a new `sentiment_score` column.
lapseTransform.apply_sentiment_analyzer()

In [15]:
# View the base after the sparse-data drop and the sentiment step (first 10 rows).
base = lapseTransform.get_transformed_df()
base.head(10)

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,GENERAL_FEEDBACK,AGE,DURATION,GENDER,LAPSE_IN_12M,RANDOMFIELD1,RANDOMFIELD2,sentiment_score
0,1,2000-01-01,1.0,I hated the product,40,20.0,M,True,100.0,Apple,-0.6369
1,2,2014-08-01,2.0,I hated the product,36,,F,False,55.0,Pear,-0.6369
2,3,2014-08-01,4.0,,20,12.0,M,False,30.0,Beat,0.0
3,4,2014-08-01,2.0,,37,18.0,M,True,42.0,Apple,0.0
4,5,2014-08-01,5.0,Middle of the range product,66,563.0,M,True,,,0.0
6,7,2014-08-01,3.0,Middle of the range product,47,38.0,F,False,55.0,Apple,0.0
8,9,2014-08-01,5.0,I liked the product,24,563.0,M,True,42.0,Beat,0.4215
9,10,2014-08-01,1.0,I liked the product,55,42.0,F,False,255.0,Pear,0.4215
11,12,2014-08-01,3.0,Middle of the range product,47,38.0,F,False,55.0,Apple,0.0
13,14,2014-08-01,5.0,I liked the product,24,563.0,M,True,42.0,Beat,0.4215


In [16]:
# Drop the raw feedback text column from the working frame; the sentiment step has already run.
lapseTransform.drop_column('GENERAL_FEEDBACK')

In [17]:
# Impute missing values. The function accepts an optional segmentation field to improve
# numeric imputation; here that field is `segmentor` ('DURATION').
# In the saved outputs, the missing RANDOMFIELD2 value becomes 'unknown' and the missing
# numeric values are filled in.
lapseTransform.impute(segmentor)

In [18]:
# View the base after dropping the raw text column and imputing (first 10 rows).
base = lapseTransform.get_transformed_df()
base.head(10)

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,AGE,DURATION,GENDER,LAPSE_IN_12M,RANDOMFIELD1,RANDOMFIELD2,sentiment_score
0,1,2000-01-01,1.0,40,20.0,M,True,100.0,Apple,-0.6369
1,2,2014-08-01,2.0,36,38.0,F,False,55.0,Pear,-0.6369
2,3,2014-08-01,4.0,20,12.0,M,False,30.0,Beat,0.0
3,4,2014-08-01,2.0,37,18.0,M,True,42.0,Apple,0.0
4,5,2014-08-01,5.0,66,563.0,M,True,42.0,unknown,0.0
6,7,2014-08-01,3.0,47,38.0,F,False,55.0,Apple,0.0
8,9,2014-08-01,5.0,24,563.0,M,True,42.0,Beat,0.4215
9,10,2014-08-01,1.0,55,42.0,F,False,255.0,Pear,0.4215
11,12,2014-08-01,3.0,47,38.0,F,False,55.0,Apple,0.0
13,14,2014-08-01,5.0,24,563.0,M,True,42.0,Beat,0.4215


In [19]:
# Standardise and encode categorical fields.
# In the saved output of the next view cell, GENDER and RANDOMFIELD2 are replaced by
# lower-cased indicator columns (e.g. GENDER_f, RANDOMFIELD2_apple).
lapseTransform.encoder()

In [20]:
# View the base after categorical encoding (first 10 rows).
base = lapseTransform.get_transformed_df()
base.head(10)

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,AGE,DURATION,LAPSE_IN_12M,RANDOMFIELD1,sentiment_score,GENDER_f,GENDER_m,RANDOMFIELD2_apple,RANDOMFIELD2_beat,RANDOMFIELD2_pear,RANDOMFIELD2_unknown
0,1,2000-01-01,1.0,40,20.0,True,100.0,-0.6369,False,True,True,False,False,False
1,2,2014-08-01,2.0,36,38.0,False,55.0,-0.6369,True,False,False,False,True,False
2,3,2014-08-01,4.0,20,12.0,False,30.0,0.0,False,True,False,True,False,False
3,4,2014-08-01,2.0,37,18.0,True,42.0,0.0,False,True,True,False,False,False
4,5,2014-08-01,5.0,66,563.0,True,42.0,0.0,False,True,False,False,False,True
6,7,2014-08-01,3.0,47,38.0,False,55.0,0.0,True,False,True,False,False,False
8,9,2014-08-01,5.0,24,563.0,True,42.0,0.4215,False,True,False,True,False,False
9,10,2014-08-01,1.0,55,42.0,False,255.0,0.4215,True,False,False,False,True,False
11,12,2014-08-01,3.0,47,38.0,False,55.0,0.0,True,False,True,False,False,False
13,14,2014-08-01,5.0,24,563.0,True,42.0,0.4215,False,True,False,True,False,False


In [21]:
# Scale numeric fields.
lapseTransform.scaler()

In [22]:
# View the base after numeric scaling (first 10 rows).
# NOTE(review): the saved output under this cell shows single GENDER / RANDOMFIELD2 columns
# and a missing RANDOMFIELD1 value, which does not match the one-hot, imputed frame shown
# after the encoder cell. It looks like output from an earlier run; re-run from a fresh
# kernel to confirm.
base = lapseTransform.get_transformed_df()
base.head(10)

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,AGE,DURATION,GENDER,LAPSE_IN_12M,RANDOMFIELD1,RANDOMFIELD2,sentiment_score
0,1,1999-08-01,0.0,0.434783,0.125096,1.0,True,0.311111,0.0,0.0
1,2,2014-08-01,0.4,0.347826,0.0,0.0,False,0.111111,0.666667,0.0
2,3,2014-08-01,0.8,0.0,0.0,1.0,False,0.0,0.333333,0.601757
3,4,2014-08-01,0.4,0.369565,0.098989,1.0,True,0.053333,0.0,0.601757
4,5,2014-08-01,1.0,1.0,0.986643,1.0,True,,1.0,0.601757
6,7,2014-08-01,0.6,0.586957,0.286571,0.0,False,0.111111,0.0,0.601757
8,9,2014-08-01,1.0,0.086957,1.0,1.0,True,0.053333,0.333333,1.0
9,10,2014-08-01,0.2,0.76087,0.31204,0.0,False,1.0,0.666667,1.0
11,12,2014-08-01,0.6,0.586957,0.286571,0.0,False,0.111111,0.0,0.601757
13,14,2014-08-01,1.0,0.086957,1.0,1.0,True,0.053333,0.333333,1.0


In [None]:
# Save the processed data to data/processed/processed_lapse.csv under the project root.
lapseTransform.store_data(os.path.join(parent_directory,"data","processed","processed_lapse.csv"))

## Test model pipeline ETL

In [22]:
from resources.utils.model import HistGBMPipeline
from resources.utils.report import FeatureImportanceAnalyzer

In [23]:

# Split off the scoring set by date. Change the first argument if a field other than
# MONTH_KEY is used for the date split; the second argument is the split date.
# Note: if this cell is run twice, the returned scoring set will be empty.
score_df = lapseTransform.split_by_date('MONTH_KEY', '2014-09-01')

In [29]:
# View the scoring set returned by split_by_date.
# NOTE(review): the saved output under this cell contains only the header row, i.e. the
# scoring set was empty when the cell last ran. Confirm the split date against the
# MONTH_KEY values in the data (or whether the split cell was run twice).
score_df.head()

Unnamed: 0,POL_NUMBER,MONTH_KEY,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,AGE,DURATION,LAPSE_IN_12M,RANDOMFIELD1,sentiment_score,GENDER_f,GENDER_m,RANDOMFIELD2_apple,RANDOMFIELD2_beat,RANDOMFIELD2_pear,RANDOMFIELD2_unknown


In [24]:
# Perform a train/test split; also returns `features`, which is passed on to the model pipeline.
x_train, x_test, y_train, y_test, features = lapseTransform.split_data()

## Test Histogram GBM training pipeline

In [27]:
# Initialise the HistGBM modelling pipeline with the train/test split, the features and the target column.
lapseHistGBM = HistGBMPipeline(x_train, x_test, y_train, y_test, features, target)

In [24]:
# Optimise and train the HistGBM model held by the pipeline.
lapseHistGBM.HistGBM_optimize_and_train()

Unnamed: 0,HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT,AGE,DURATION,GENDER,RANDOMFIELD1,RANDOMFIELD2,sentiment_score
18,1.0,0.086957,1.0,1.0,0.053333,0.333333,1.0
6,0.6,0.586957,0.286571,0.0,0.111111,0.0,0.601757
11,0.6,0.586957,0.286571,0.0,0.111111,0.0,0.601757
2,0.8,0.0,0.0,1.0,0.0,0.333333,0.601757
1,0.4,0.347826,0.0,0.0,0.111111,0.666667,0.0


## Test model validation modules

In [None]:
# Validate the trained model.
lapseHistGBM.validate_performance()                  

In [None]:
# Plot model performance and save the figure under reports/figures.
# The figure gets its own file name: the feature-importance summary cell further down
# saves to "lapseFeatureImportanceGraph.png" in the same folder, so reusing that name here
# would let one figure overwrite the other.
lapseHistGBM.plot_performance(os.path.join(parent_directory,"reports","figures","lapse_HistGBM_performanceGraph.png"))

## Test model pickling capability

In [None]:
# Save the trained model to models/Lapse_HistGBM.pkl under the project root.
lapseHistGBM.save_model(os.path.join(parent_directory,"models","Lapse_HistGBM.pkl"))

In [None]:
# Load the model back from the file written by the save cell.
# NOTE(review): the .pkl suffix suggests pickle; only load model files from a trusted source.
lapseHistGBM.load_model(os.path.join(parent_directory,"models","Lapse_HistGBM.pkl"))

## Test scoring

In [25]:
# Scoring features = scoring frame minus the key columns and the target column
# (the target should be empty for scoring rows).
remove_col = [*keys, target]
x_score = score_df.drop(columns=remove_col)
# Produce event predictions for the scoring rows with the trained model.
y_pred = lapseHistGBM.score(x_score)

13     True
16    False
0      True
Name: LAPSE_IN_12M, dtype: bool

## Test Feature Importance Analysis functions

In [26]:
# Extract the fitted model object from the pipeline.
# NOTE(review): an assignment displays nothing, so the feature-name list saved under this
# cell looks like output from an earlier version of the cell. Re-run to confirm.
lapseHistGBMModel = lapseHistGBM.get_model()

['HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT',
 'AGE',
 'DURATION',
 'GENDER',
 'RANDOMFIELD1',
 'RANDOMFIELD2',
 'sentiment_score']

In [None]:
# Initialise the feature importance analyzer with the model, the train/test/scoring
# feature sets, the test labels and the scoring predictions.
lapseFeatureAnalyzer = FeatureImportanceAnalyzer(lapseHistGBMModel, x_train, x_test, x_score, y_test, y_pred)

In [None]:
# Compute permutation importance. This and the following importance cells are independent
# steps; comment out any that are not required.
lapseFeatureAnalyzer.compute_permutation_importance()

In [None]:
 # Compute SHAP values.
lapseFeatureAnalyzer.compute_tree_shap_values()

In [None]:
# Generate the SHAP analysis graph for a set number of the most important variables (here 10)
# and save it under reports/figures.
lapseFeatureAnalyzer.shap_analysis(os.path.join(parent_directory,"reports","figures","lapse_HistGBM_shapGraph.png"), 10)

In [None]:
# Compute LIME importance values.
lapseFeatureAnalyzer.compute_lime_importance()

In [None]:
# Generate the LIME analysis for one sample and save it as an HTML report.
# The first argument is the index of the sample to explain (here 0).
lapseFeatureAnalyzer.lime_analysis(0, os.path.join(parent_directory,"reports","figures","lapse_HistGBM_LIMEReport.html"))

In [None]:
# Generate the feature importance summary graph and save it under reports/figures.
lapseFeatureAnalyzer.generate_graphs(os.path.join(parent_directory,"reports","figures","lapseFeatureImportanceGraph.png"))

In [27]:
  
# Generate the aggregated feature importance .csv report under reports/.
lapseFeatureAnalyzer.export_csv(os.path.join(parent_directory,"reports","lapseFeatureImportanceSummary.csv"))

In [None]:
1. SHAP (SHapley Additive exPlanations)
Definition:
SHAP values are based on cooperative game theory and provide a way to fairly attribute each feature's contribution to the prediction for an individual instance. The method considers all possible combinations of features to calculate each feature's marginal contribution.

In [None]:
Advantages (SHAP):
Provides a consistent and unbiased measure of feature importance.
Can capture complex non-linear interactions between features.

In [None]:
2. Permutation Feature Importance
Definition:
This method measures the change in the model's performance (e.g., accuracy, AUC) when a single feature's values are randomly shuffled. The idea is that if a feature is important, shuffling its values should significantly degrade the model's performance.
Advantages:
Provides a more reliable estimate of a feature's true importance by directly measuring how much the model's predictive performance depends on that feature.
Captures complex interactions between features that other importance measures might miss.
Disadvantages:
More computationally expensive, since the model must be re-evaluated many times (once for each feature and each shuffle repeat); the model itself does not need to be retrained.

In [None]:
3. LIME (Local Interpretable Model-agnostic Explanations)
Definition:
LIME approximates the model locally around a specific instance by fitting a simpler interpretable model (like linear regression). It shows how features contribute to the model's decision for individual predictions.
Advantages:
Provides local interpretability for specific predictions.
Helps understand how the model behaves on individual instances and what drives each prediction.

In [None]:
Many automated tools, such as Sphinx, can generate documentation from docstrings, which makes docstrings an integral part of creating comprehensive project documentation.