In [1]:
# Colab was used for this notebook hence the library installs
%%capture
!pip install pycaret --quiet --upgrade
!pip install dabl --quiet --upgrade
!pip install shap --quiet --upgrade
!pip install sdv  --quiet --upgrade
!pip install sdv[ctgan] --quiet --upgrade
!pip install baytune --quiet --upgrade
!pip install optuna --quiet --upgrade

UsageError: Line magic function `%%capture` not found.


# **Libraries**

In [2]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
#import dabl as db # Summary plot
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
from btb.tuning import Tunable, GCPTuner # CopulaGAN optimising
from btb.tuning import hyperparams as hp  # Set hyperparameters for optimising
import joblib # Saving preparation steps

# **Importing data**

In [3]:
# Read and output the top 5 rows
hr_data = pd.read_csv("data/HR Employee Attrition.csv")
hr_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


# **First modelling w/original data**
I'll use `Pycaret` to model the data given it's easy of use and speed. It can handle a lot of the preprocessing, modelling, evaluation and interpretation. Given the dataset is ~1.5k rows the processing isn't too intensive but if it increased in size thankfully Pycaret has GPU functionality.

Ordinal features need to have their levels known and ordered using a dictionary which can quickly be done.

In [4]:
ord_levels = ['StockOptionLevel', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 
              'Education', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance']

# Print unique values of the ordinal features
for feat in ord_levels:
  print(feat, hr_data[feat].unique())

StockOptionLevel [0 1 3 2]
EnvironmentSatisfaction [2 3 4 1]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1]
Education [2 1 4 3 5]
PerformanceRating [3 4]
RelationshipSatisfaction [1 4 2 3]
WorkLifeBalance [1 3 2 4]


In [5]:
# Target feature
target = "Attrition"

# Continuous/numeric features
cont_feats = ["DistanceFromHome", "HourlyRate",  "DailyRate", "MonthlyIncome",
              "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
              "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
              "YearsWithCurrManager", "TrainingTimesLastYear", "YearsSinceLastPromotion"]

# Ordinal features
ord_feats = {"StockOptionLevel" : ["0", "1", "2", "3"],
             "EnvironmentSatisfaction" : ["1", "2", "3", "4"],
             "JobInvolvement" : ["1", "2", "3", "4"],
             "JobSatisfaction" : ["1", "2", "3", "4"],
             "Education" : ["1", "2", "3", "4", "5"],
             "PerformanceRating" : ["3", "4"],
             "RelationshipSatisfaction" : ["1", "2", "3", "4"],
             "WorkLifeBalance" : ["1", "2", "3", "4"]}

real_ord_feats = {"StockOptionLevel" : [0, 1, 2, 3],
             "EnvironmentSatisfaction" : [1, 2, 3, 4],
             "JobInvolvement" : [1, 2, 3, 4],
             "JobSatisfaction" : [1, 2, 3, 4],
             "Education" : [1, 2, 3, 4, 5],
             "PerformanceRating" : [3, 4],
             "RelationshipSatisfaction" : [1, 2, 3, 4],
             "WorkLifeBalance" : [1, 2, 3, 4]}

# Categorical geatures
cat_feats = ["BusinessTravel", "Department", "EducationField", 
             "JobRole", "Gender", "JobLevel", "JobRole", 
             "MaritalStatus", "OverTime", "WorkLifeBalance"]

# Features to ignore
ignore = ["EmployeeNumber", "StandardHours", "EmployeeCount", "Over18"]

Now with the features initalised they can be fed into pycaret's `setup` function involving steps...
* Feed in data + features
* Normalize continuous features + use `minimax` as the normalization method
* Use stratified k folding on the data
* Remove features with low variance
* Split the data into 70:30 train:test

There a lot on offer but not everything is used here.

In [259]:
# Run pycaret setup
setup(hr_data, 
      target = target,
      train_size = 0.7, 
      fold_strategy = "stratifiedkfold",
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True)

Unnamed: 0,Description,Value
0,session_id,5560
1,Target,Attrition
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(1470, 35)"
5,Missing Values,False
6,Numeric Features,14
7,Categorical Features,16
8,Ordinal Features,True
9,High Cardinality Features,False


(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 368   0.373268  -0.546058          1.552754        1.0   
 774   2.025334  -0.872535         -0.906959        0.0   
 60   -0.507833  -0.914902         -1.029945        2.0   
 128  -1.609210  -0.498707         -0.906959        0.0   
 1134 -0.177420   1.382895         -0.292031        1.0   
 ...        ...        ...               ...        ...   
 705   0.263131   0.271379         -0.906959        4.0   
 1280  0.042855   1.108755         -0.169045        1.0   
 1175  0.263131  -0.752910          0.322897        2.0   
 1061 -1.388935   0.089450          0.445883        1.0   
 1069 -0.948384   1.567317         -1.029945        2.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 368                       2.0    0.099514             1.0              2.0   
 774                       2.0   -1.288285             1.0    

Now it's "setup", I like to use `compare_models` to get a rough idea as to which model(s) might be best for this particular use case. The particular metrics I'm looking at are `AUC`, `Precision` and `Recall`. `Recall` is most important as we want to make sure we are correctly identifying the **true negatives** i.e. those that might leave in 6 months. Cross validation is also used.

In [260]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8813,0.8429,0.4401,0.7181,0.539,0.4767,0.4986,0.616
lda,Linear Discriminant Analysis,0.8793,0.8334,0.4581,0.7038,0.5426,0.4786,0.4988,0.013
lightgbm,Light Gradient Boosting Machine,0.8648,0.8158,0.3191,0.673,0.4288,0.3639,0.3982,0.038
ada,Ada Boost Classifier,0.8667,0.8139,0.4107,0.6223,0.4921,0.4207,0.4335,0.039
gbc,Gradient Boosting Classifier,0.8658,0.8108,0.2956,0.7073,0.4104,0.3502,0.3954,0.064
et,Extra Trees Classifier,0.8502,0.8063,0.1636,0.5645,0.2461,0.1998,0.2451,0.126
rf,Random Forest Classifier,0.8531,0.8006,0.1393,0.6867,0.2285,0.1901,0.2668,0.127
nb,Naive Bayes,0.602,0.7488,0.732,0.251,0.3727,0.1744,0.2287,0.014
knn,K Neighbors Classifier,0.8511,0.6841,0.1566,0.685,0.2533,0.2048,0.276,0.306
qda,Quadratic Discriminant Analysis,0.3493,0.6321,0.9515,0.1943,0.322,0.0736,0.1715,0.013


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5560, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

So `Logistic Regression` is looking promising for `AUC` but `Recall` is not high, regardless I'll build a model on this. As shown below each fold's metrics are shown and the average from them.

In [261]:
lr = create_model("lr") # Create logistic regression model

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9029,0.8614,0.5,0.8,0.6154,0.5632,0.5836
1,0.8641,0.8614,0.3125,0.625,0.4167,0.3493,0.3763
2,0.8738,0.7695,0.3529,0.75,0.48,0.4186,0.4573
3,0.835,0.8755,0.2353,0.5,0.32,0.2397,0.2618
4,0.8835,0.8605,0.4706,0.7273,0.5714,0.5076,0.5237
5,0.932,0.9405,0.6471,0.9167,0.7586,0.7204,0.7352
6,0.8835,0.7394,0.3529,0.8571,0.5,0.4467,0.5034
7,0.9029,0.8003,0.5294,0.8182,0.6429,0.5896,0.6084
8,0.8725,0.8968,0.5,0.6154,0.5517,0.4784,0.4819
9,0.8627,0.8241,0.5,0.5714,0.5333,0.4533,0.4547


Some hyperparameter tuning may help improve the model a bit so I'll use `tune_model` and focus on `AUC`. I'm using the `optuna` library and searching using `tpe` searching algorithm.

Overall there doesn't seem like much improvement.

In [262]:
tune_model(lr, optimize = "AUC", n_iter = 30, search_library = "optuna", search_algorithm = "tpe")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9029,0.8628,0.5,0.8,0.6154,0.5632,0.5836
1,0.8738,0.8614,0.3125,0.7143,0.4348,0.3758,0.4167
2,0.8641,0.7613,0.2941,0.7143,0.4167,0.3545,0.3995
3,0.8544,0.8803,0.2353,0.6667,0.3478,0.2864,0.3361
4,0.8835,0.868,0.4118,0.7778,0.5385,0.4789,0.5107
5,0.9126,0.9405,0.5294,0.9,0.6667,0.6202,0.6492
6,0.8835,0.7462,0.2941,1.0,0.4545,0.4103,0.508
7,0.9029,0.803,0.4706,0.8889,0.6154,0.5658,0.6033
8,0.8922,0.9055,0.4375,0.7778,0.56,0.504,0.5311
9,0.8627,0.8256,0.4375,0.5833,0.5,0.4223,0.4282


LogisticRegression(C=0.28063138415196753, class_weight={}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5560, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# **How to potentially solve this problem**
So the `lr` model isn't the best. Hyperparameter tuning wasn't great.

More data seems to be preached a lot but what if you can't? **GENERATE IT**.

The problem is making sure the data generated shares similar characteristics as the original data. Thankfully this can be evaluated.

I've decided to generate synthetic data only for people that have left in the last 6 months i.e. `Attrition == "Yes"` to lessen the class imbalance.

# **Generating synthetic data**
There's a useful library `sdev` [Synthetic Data Generation](https://github.com/sdv-dev/SDV) to generate synthetic tabular data. I've played around with it specifically the `CopulaGAN` model.

The general idea is to provide a `primary key` and to then run the model for so many `epochs`. There are hyperparameters to tune
* `epochs`
* `batch_size`
* `embedding_dim`
* `gen_dim`
* `dis_dim`
* `l2scale` (omitted in this experiment)



Rather than running a CopulaGAN without hyperparameters or guessing hyperparameters I've leveraged `BTB` [Bayesian Tuning and Bandits](https://github.com/MLBazaar/BTB) which is an optimisation library. To start I'll initialise the `GCPTuner` which is used to declare the hyperparameters to tune. Then the hyperparameters can be called using `propose` from BTB.

In [6]:
tuner = GCPTuner(Tunable({
          'epochs': hp.IntHyperParam(min = 80, max = 400),
          'batch_size' : hp.IntHyperParam(min = 1, max = 100),
          'embedding_dim' : hp.IntHyperParam(min = 1, max = 100),
          'gen' : hp.IntHyperParam(min = 1, max = 1000),
          'dim_gen' : hp.IntHyperParam(min = 1, max = 1000)
        }))

With the tuner ready it's as simple as creating a training loop and then calling the tuner within the loop and feeding the hyperparameters into the CopulaGAN.

In [None]:
best_score = 0 # Keep track of best score
tracker = 0 # Keep track of how many loops have completed

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
for _ in range(10):

  # Increment the tracker
    tracker += 1

  # Every 5 loops output the tracker
    if tracker % 1 == 0:
        print(tracker)

  # Get the hyperparameters for this loop
    proposal = tuner.propose(1)
  
  # Create the CopulaGAN
  # NOTE - batch_size is multiplied by 10 as needs to be a factor of 10
    model = CopulaGAN(primary_key = "EmployeeNumber", 
                    embedding_dim = proposal['embedding_dim'],
                    generator_dim = (proposal['gen'], proposal['gen']),
                    discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
                    batch_size = proposal['batch_size'] * 10,
                    epochs = proposal['epochs'])
  
  # Fit the CopulaGAN
    model.fit(real)
  
  # Create 600 rows of data
    synth_data = model.sample(600, max_retries = 300)
  
  # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

  # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score

  # Record the hyperparameters and score      
    tuner.record(proposal, score)

## TRAINING LOOP END ##


print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)

1


In [265]:
model.save('best_copula.pkl')

In [266]:
synth_data.to_csv("synth_data.csv", index = False)

So after 75 loops the overall score is 0.81 (this score is between 0 and 1) for 600 synthetic examples of Attrition = "Yes" so it's not too bad.

```
Best score obtained:  0.8127129012858366
Best parameters:  {'epochs': 208, 'batch_size': 42, 'embedding_dim': 10, 'gen': 54, 'dim_gen': 788}
```

# **Setting up data for round 2**
The real dataset can be split into 60:40 training/test. Once this has been split the synthetic data can be added to the training data. This in a way doesn't "waste" the real dataset in the training.

In [267]:
# Load synth data in
synth_data = pd.read_csv("synth_data.csv")

# Split real data into training + test set
train, test, target_train, target_test = train_test_split(hr_data.drop("Attrition", axis = 1), hr_data["Attrition"], test_size = 0.4, random_state = 42)

# Add Attrition column back into training + test set
train["Attrition"] = target_train
test["Attrition"] = target_test

# Add the 600 synthetic rows of data to training data + remove dataset column
input = pd.concat([train, synth_data])

In [268]:
print("Training size (real):", train.shape)
print("Training size (fake + real):", input.shape)
print("Testing size (real:", test.shape)

Training size (real): (882, 35)
Training size (fake + real): (1482, 35)
Testing size (real: (588, 35)


In [269]:
for feat in list(ord_feats.keys()):
  print(feat, input[feat].unique())

StockOptionLevel [0 1 2 3 4]
EnvironmentSatisfaction [2 3 1 4 5]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1 5]
Education [4 2 3 1 5]
PerformanceRating [3 4 5]
RelationshipSatisfaction [4 2 1 3 5 0]
WorkLifeBalance [2 3 4 1]


In [270]:
#ord_feats = {"StockOptionLevel" : ["0", "1", "2", "3", "4"],
#             "EnvironmentSatisfaction" : ["0", "1", "2", "3", "4", "5"],
#             "JobInvolvement" : ["1", "2", "3", "4"],
#             "JobSatisfaction" : ["1", "2", "3", "4", "5"],
#             "Education" : ["1", "2", "3", "4", "5"],
#             "PerformanceRating" : ["3", "4", "5"],
#             "RelationshipSatisfaction" : ["1", "2", "3", "4", "5"],
#             "WorkLifeBalance" : ["1", "2", "3", "4"]}
ord_feats={feat:[str(num) for num in input[feat].unique()] for feat in ord_feats.keys()}
ord_feats

{'StockOptionLevel': ['0', '1', '2', '3', '4'],
 'EnvironmentSatisfaction': ['2', '3', '1', '4', '5'],
 'JobInvolvement': ['3', '2', '4', '1'],
 'JobSatisfaction': ['4', '2', '3', '1', '5'],
 'Education': ['4', '2', '3', '1', '5'],
 'PerformanceRating': ['3', '4', '5'],
 'RelationshipSatisfaction': ['4', '2', '1', '3', '5', '0'],
 'WorkLifeBalance': ['2', '3', '4', '1']}

Now this data can be fed into a new `setup` but this time declaring `test_data` as the test data that was created earlier. 

So there is a combination of **synthetic and real data in the training set** but only real in the test data.

In [271]:
# Run pycaret setup w/synthetic data
setup(input, 
      target = target, 
      test_data = test,
      fold_strategy = "stratifiedkfold",
      train_size = 0.7, 
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True, verbose = False)

(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 1041 -0.633289   0.120282         -0.488997        2.0   
 184   1.814124   0.636110          0.466748        1.0   
 1222 -1.024875  -1.360947          1.541960        3.0   
 67    1.030952   1.239486         -0.250061        2.0   
 220   0.149883   1.374359         -0.488997        1.0   
 ...        ...        ...               ...        ...   
 1009  2.303607   0.567491         -0.966869        2.0   
 757  -0.045910  -1.417735         -0.966869        0.0   
 1361 -0.829082   0.664504         -0.369529        2.0   
 376   1.618331   0.858531          0.586216        1.0   
 314   0.443573  -1.651987          0.108344        3.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 1041                      3.0    1.130787             0.0              3.0   
 184                       3.0   -0.100131             2.0    

Now setup the `compare_models` can be checked again. Interestingly the `AUC`, `Recall` and `Precision` all went up considerably. `Recall` nearly doubled!

In [272]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8761,0.9461,0.8339,0.9148,0.8524,0.7536,0.7702,0.086
rf,Random Forest Classifier,0.8674,0.9424,0.8392,0.8944,0.8455,0.736,0.753,0.148
lightgbm,Light Gradient Boosting Machine,0.8613,0.9395,0.826,0.8944,0.8378,0.7239,0.7407,0.04
et,Extra Trees Classifier,0.8599,0.9333,0.8261,0.8931,0.8345,0.7213,0.7396,0.136
ada,Ada Boost Classifier,0.841,0.9228,0.8231,0.8562,0.8293,0.6827,0.6923,0.043
lr,Logistic Regression,0.816,0.9013,0.8019,0.8316,0.8094,0.6323,0.6402,0.689
lda,Linear Discriminant Analysis,0.8146,0.8988,0.7899,0.8369,0.8064,0.6297,0.6369,0.012
qda,Quadratic Discriminant Analysis,0.7797,0.8601,0.6277,0.865,0.6967,0.5637,0.5812,0.013
knn,K Neighbors Classifier,0.7486,0.8365,0.601,0.8592,0.6719,0.5012,0.5272,0.294
nb,Naive Bayes,0.6823,0.8279,0.4609,0.8613,0.5793,0.3709,0.4267,0.011


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8650, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

A gradient boosting classifier was decided upon.

In [273]:
gbc = create_model("gbc") # Create the gradient boosting classifier

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6309,0.7547,0.3289,0.8621,0.4762,0.2707,0.3462
1,0.698,0.8472,0.4342,0.9429,0.5946,0.4024,0.4797
2,0.9257,0.9792,0.9067,0.9444,0.9252,0.8514,0.8521
3,0.9459,0.9881,0.9733,0.9241,0.9481,0.8918,0.8931
4,0.9324,0.9865,0.92,0.9452,0.9324,0.8649,0.8652
5,0.9189,0.9805,0.9467,0.8987,0.9221,0.8377,0.8389
6,0.9054,0.9724,0.9737,0.8605,0.9136,0.81,0.8175
7,0.9392,0.9828,0.9474,0.9351,0.9412,0.8782,0.8783
8,0.9324,0.9826,0.9474,0.9231,0.9351,0.8647,0.865
9,0.9324,0.9867,0.9605,0.9125,0.9359,0.8646,0.8658


# **Generating synthetic data without tuning, doing the same thing**
There's a useful library `sdev` [Synthetic Data Generation](https://github.com/sdv-dev/SDV) to generate synthetic tabular data. I've played around with it specifically the `CopulaGAN` model.

The general idea is to provide a `primary key` and to then run the model for so many `epochs`. There are hyperparameters to tune
* `epochs`
* `batch_size`
* `embedding_dim`
* `gen_dim`
* `dis_dim`
* `l2scale` (omitted in this experiment)



Rather than running a CopulaGAN without hyperparameters or guessing hyperparameters I've leveraged `BTB` [Bayesian Tuning and Bandits](https://github.com/MLBazaar/BTB) which is an optimisation library. To start I'll initialise the `GCPTuner` which is used to declare the hyperparameters to tune. Then the hyperparameters can be called using `propose` from BTB.

In [274]:
import sdv.constraints as cons
def MAX(a,b): return (abs(a-b)+(a+b))//2

def MIN(a,b): return (-abs(a-b)+(a+b))//2

def over_18(data): return MAX(data["Age"],18)

age_constraint = cons.ColumnFormula(column = 'Age',
                                    formula = over_18,
                                    handling_strategy = 'reject_sampling')

def ord_feats_valid(feat):
    def foo(data):
        return MIN(MAX(data[feat], max(int(ord_feat[feat]))), min(int(ord_feat[feat]))) 
    return foo

ord_feats_funcs = {feat : lambda data : MIN(MAX(data[feat], int(max(ord_feats[feat]))), int(min(ord_feats[feat])))  for feat in ord_feats.keys()}

ord_feats_funcs2 = {feat : ord_feats_valid(feat) for feat in ord_feats.keys()}

ord_feats_constraints = [cons.ColumnFormula(column = feat,
                                            formula = ord_feats_funcs[feat],
                                            handling_strategy = 'reject_sampling') for feat in ord_feats.keys()]

#ord_feats_constraints = [cons.ColumnFormula(column = "StockOptionLevel",
#                                            formula = ord_feats_funcs2["StockOptionLevel"],
#                                            handling_strategy = 'reject_sampling')]

ord_feats_funcs2["StockOptionLevel"], over_18

(<function __main__.ord_feats_valid.<locals>.foo(data)>,
 <function __main__.over_18(data)>)

In [275]:
best_score = 0 # Keep track of best score
tracker = 0 # Keep track of how many loops have completed
best_data = None

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
for _ in range(5):

  # Increment the tracker
    tracker += 1

  # Every 5 loops output the tracker
    if tracker % 1 == 0:
        print(tracker)

  # Get the hyperparameters for this loop
    #proposal = tuner.propose(1)
  
  # Create the CopulaGAN
  # NOTE - batch_size is multiplied by 10 as needs to be a factor of 10
    #model = CopulaGAN(primary_key = "EmployeeNumber", 
    #                embedding_dim = proposal['embedding_dim'],
    #                generator_dim = (proposal['gen'], proposal['gen']),
    #                discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
    #                batch_size = proposal['batch_size'] * 10,
    #                epochs = proposal['epochs'],
    #                constraints = ord_feats_constraints+[age_constraint])
    #model = CopulaGAN(constraints = ord_feats_constraints+[age_constraint])
    model = CopulaGAN()
  
  # Fit the CopulaGAN
    model.fit(real)
  
  # Create 600 rows of data
    synth_data = model.sample(600, max_retries = 300)
    #print(synth_data)
  
  # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

  # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        #best_params = proposal
        best_score = score
        best_data = synth_data

  # Record the hyperparameters and score      
  #  tuner.record(proposal, score)

## TRAINING LOOP END ##


print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)

1
2
3
4
5
Best score obtained:  0.4941354249535848
Best parameters:  {'epochs': 330, 'batch_size': 48, 'embedding_dim': 94, 'gen': 62, 'dim_gen': 675}


In [276]:
synth_data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,32,Yes,Travel_Rarely,1333,Research & Development,27,5,Human Resources,1,2038,...,3,80,0,3,2,1,-2,25,1,0
1,38,Yes,Travel_Rarely,1344,Human Resources,5,3,Life Sciences,1,1699,...,2,80,0,4,6,3,2,7,1,7
2,31,Yes,Travel_Rarely,1494,Research & Development,0,4,Life Sciences,1,1943,...,5,80,0,4,2,3,6,2,0,3
3,16,Yes,Travel_Rarely,1203,Sales,17,3,Life Sciences,1,2049,...,4,80,0,32,2,4,11,4,4,0
4,35,Yes,Travel_Rarely,1180,Research & Development,27,4,Human Resources,1,2052,...,3,80,0,19,3,2,2,6,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,14,Yes,Travel_Rarely,1478,Research & Development,24,4,Medical,1,2049,...,4,80,0,12,2,1,7,3,0,4
596,12,Yes,Travel_Rarely,1470,Research & Development,24,2,Medical,1,2018,...,2,80,0,23,3,2,4,2,0,10
597,24,Yes,Travel_Rarely,984,Research & Development,11,1,Other,1,1940,...,1,80,0,14,2,1,2,2,2,5
598,33,Yes,Travel_Frequently,1373,Research & Development,18,1,Medical,1,2051,...,2,80,0,9,2,4,4,1,0,4


In [277]:
best_data.to_csv("synth_data_without_tuning.csv", index = False)

So after 75 loops the overall score is 0.81 (this score is between 0 and 1) for 600 synthetic examples of Attrition = "Yes" so it's not too bad.

```
Best score obtained:  0.8127129012858366
Best parameters:  {'epochs': 208, 'batch_size': 42, 'embedding_dim': 10, 'gen': 54, 'dim_gen': 788}
```

# **Setting up data for round 2**
The real dataset can be split into 60:40 training/test. Once this has been split the synthetic data can be added to the training data. This in a way doesn't "waste" the real dataset in the training.

In [278]:
# Load synth data in
synth_data = pd.read_csv("synth_data_without_tuning.csv")

# Split real data into training + test set
train, test, target_train, target_test = train_test_split(hr_data.drop("Attrition", axis = 1), hr_data["Attrition"], test_size = 0.4, random_state = 42)

# Add Attrition column back into training + test set
train["Attrition"] = target_train
test["Attrition"] = target_test

# Add the 600 synthetic rows of data to training data + remove dataset column
input = pd.concat([train, synth_data])

In [279]:
synth_data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,62,Yes,Travel_Frequently,1163,Sales,1,1,Technical Degree,1,957,...,4,80,0,11,2,3,3,1,9,-1
1,42,Yes,Travel_Rarely,1407,Sales,18,1,Life Sciences,1,988,...,1,80,3,1,2,3,3,4,0,2
2,27,Yes,Travel_Frequently,254,Research & Development,26,1,Human Resources,1,1989,...,4,80,1,11,3,4,3,4,0,7
3,38,Yes,Travel_Rarely,1460,Sales,31,2,Medical,1,646,...,2,80,2,14,2,1,34,0,1,-1
4,31,Yes,Travel_Frequently,1283,Sales,-1,2,Life Sciences,1,402,...,3,80,0,5,2,3,2,3,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,22,Yes,Travel_Rarely,1235,Sales,9,1,Life Sciences,1,811,...,2,80,0,13,0,1,2,14,1,-1
596,32,Yes,Travel_Rarely,1417,Research & Development,31,5,Life Sciences,1,2054,...,1,80,0,2,2,3,4,3,1,-6
597,30,Yes,Travel_Rarely,107,Research & Development,15,4,Life Sciences,1,559,...,5,80,0,2,2,3,1,1,2,8
598,30,Yes,Travel_Rarely,786,Sales,10,3,Technical Degree,1,1657,...,3,80,2,7,0,3,6,2,0,0


In [280]:
print("Training size (real):", train.shape)
print("Training size (fake + real):", input.shape)
print("Testing size (real:", test.shape)

Training size (real): (882, 35)
Training size (fake + real): (1482, 35)
Testing size (real: (588, 35)


In [281]:
for feat in list(ord_feats.keys()):
  print(feat, input[feat].unique())

StockOptionLevel [0 1 2 3 4]
EnvironmentSatisfaction [2 3 1 4 5 0]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1 5 0]
Education [4 2 3 1 5]
PerformanceRating [3 4 5]
RelationshipSatisfaction [4 2 1 3 0 5]
WorkLifeBalance [2 3 4 1]


In [282]:
ord_feats={feat:[str(num) for num in input[feat].unique()] for feat in ord_feats.keys()}

Now this data can be fed into a new `setup` but this time declaring `test_data` as the test data that was created earlier. 

So there is a combination of **synthetic and real data in the training set** but only real in the test data.

In [283]:
# Run pycaret setup w/synthetic data
setup(input, 
      target = target, 
      test_data = test,
      fold_strategy = "stratifiedkfold",
      train_size = 0.7, 
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True, verbose = False)

(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 1041 -0.707414   0.308091         -0.710776        2.0   
 184   1.676368   0.797594          0.136325        1.0   
 1222 -1.088819  -1.097544          1.089314        3.0   
 67    0.913558   1.370177         -0.499001        2.0   
 220   0.055396   1.498166         -0.710776        1.0   
 ...        ...        ...               ...        ...   
 1009  2.153125   0.732476         -1.134327        2.0   
 757  -0.135306  -1.151434         -1.134327        0.0   
 1361 -0.898117   0.824539         -0.604888        2.0   
 376   1.485666   1.008663          0.242213        1.0   
 314   0.341450  -1.373731         -0.181338        3.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 1041                      3.0    0.482524             0.0              3.0   
 184                       3.0   -0.760168             2.0    

Now setup the `compare_models` can be checked again. Interestingly the `AUC`, `Recall` and `Precision` all went up considerably. `Recall` nearly doubled!

In [284]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8883,0.9541,0.8445,0.9305,0.862,0.7781,0.7967,0.089
lightgbm,Light Gradient Boosting Machine,0.8762,0.9504,0.8353,0.9134,0.8486,0.7539,0.772,0.052
rf,Random Forest Classifier,0.8789,0.9471,0.8419,0.9107,0.8516,0.7593,0.776,0.146
et,Extra Trees Classifier,0.868,0.9434,0.826,0.9055,0.8455,0.7374,0.7527,0.129
ada,Ada Boost Classifier,0.8653,0.9406,0.8379,0.8906,0.849,0.7316,0.7447,0.045
lr,Logistic Regression,0.8429,0.9238,0.8323,0.8559,0.838,0.6862,0.6928,0.019
lda,Linear Discriminant Analysis,0.8375,0.9229,0.8138,0.8616,0.8308,0.6755,0.6825,0.012
qda,Quadratic Discriminant Analysis,0.7541,0.8714,0.5352,0.8796,0.6309,0.5142,0.5449,0.012
knn,K Neighbors Classifier,0.7601,0.8538,0.5958,0.8867,0.6854,0.5243,0.5538,0.038
nb,Naive Bayes,0.5959,0.845,0.2409,0.8726,0.3521,0.2038,0.2869,0.01


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8501, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

A gradient boosting classifier was decided upon.

In [285]:
gbc = create_model("gbc") # Create the gradient boosting classifier

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6577,0.7814,0.3553,0.931,0.5143,0.3238,0.414
1,0.6644,0.8396,0.3684,0.9333,0.5283,0.3368,0.4251
2,0.9595,0.9971,0.9733,0.9481,0.9605,0.9189,0.9192
3,0.9527,0.9889,0.9733,0.9359,0.9542,0.9053,0.9061
4,0.9459,0.9929,0.96,0.9351,0.9474,0.8918,0.8922
5,0.9392,0.9821,0.9467,0.9342,0.9404,0.8783,0.8784
6,0.9324,0.9898,0.9737,0.9024,0.9367,0.8645,0.8673
7,0.9527,0.9927,0.9737,0.9367,0.9548,0.9052,0.906
8,0.9527,0.9954,0.9868,0.9259,0.9554,0.9052,0.9072
9,0.9257,0.9817,0.9342,0.9221,0.9281,0.8512,0.8513


# **Generating synthetic data without tuning, adding constraint**
There's a useful library `sdev` [Synthetic Data Generation](https://github.com/sdv-dev/SDV) to generate synthetic tabular data. I've played around with it specifically the `CopulaGAN` model.

The general idea is to provide a `primary key` and to then run the model for so many `epochs`. There are hyperparameters to tune
* `epochs`
* `batch_size`
* `embedding_dim`
* `gen_dim`
* `dis_dim`
* `l2scale` (omitted in this experiment)



Rather than running a CopulaGAN without hyperparameters or guessing hyperparameters I've leveraged `BTB` [Bayesian Tuning and Bandits](https://github.com/MLBazaar/BTB) which is an optimisation library. To start I'll initialise the `GCPTuner` which is used to declare the hyperparameters to tune. Then the hyperparameters can be called using `propose` from BTB.

In [286]:
import sdv.constraints as cons
def MAX(a,b): return (abs(a-b)+(a+b))//2

def MIN(a,b): return (-abs(a-b)+(a+b))//2

def over_18(data): return MAX(data["Age"],18)

age_constraint = cons.ColumnFormula(column = 'Age',
                                    formula = over_18,
                                    handling_strategy = 'reject_sampling')

real = hr_data[hr_data["Attrition"] == "Yes"]

def ord_feats_valid(feat):
    def foo(data):
        max_entry = max(real_ord_feats[feat])
        min_entry = min(real_ord_feats[feat])
        return MIN(MAX(data[feat], min_entry), max_entry)
    return foo

ord_feats_funcs = {feat : lambda data : MIN(MAX(data[feat], int(min(real_ord_feats[feat]))), int(max(real_ord_feats[feat])))  for feat in ord_feats.keys()}

ord_feats_funcs2 = {feat : ord_feats_valid(feat) for feat in ord_feats.keys()}

#ord_feats_constraints = [cons.ColumnFormula(column = feat,
#                                            formula = ord_feats_funcs[feat],
#                                            handling_strategy = 'reject_sampling') for feat in ord_feats.keys()]

ord_feats_constraints = [cons.ColumnFormula(column = feat,
                                            formula = ord_feats_funcs2[feat],
                                            handling_strategy = 'reject_sampling') for feat in ord_feats.keys()]


In [287]:
best_score = 0 # Keep track of best score
tracker = 0 # Keep track of how many loops have completed
best_data = None

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
for _ in range(5):

  # Increment the tracker
    tracker += 1

  # Every 5 loops output the tracker
    if tracker % 1 == 0:
        print(tracker)

  # Get the hyperparameters for this loop
    #proposal = tuner.propose(1)
  
  # Create the CopulaGAN
  # NOTE - batch_size is multiplied by 10 as needs to be a factor of 10
    #model = CopulaGAN(primary_key = "EmployeeNumber", 
    #                embedding_dim = proposal['embedding_dim'],
    #                generator_dim = (proposal['gen'], proposal['gen']),
    #                discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
    #                batch_size = proposal['batch_size'] * 10,
    #                epochs = proposal['epochs'],
    #                constraints = ord_feats_constraints+[age_constraint])
    model = CopulaGAN(constraints = ord_feats_constraints+[age_constraint])
    #model = CopulaGAN()
  
  # Fit the CopulaGAN
    model.fit(real)
  
  # Create 600 rows of data
    synth_data = model.sample(num_rows = 600, max_retries = 300)
    print(synth_data)
  
  # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

  # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        #best_params = proposal
        best_score = score
        best_data = synth_data

  # Record the hyperparameters and score      
  #  tuner.record(proposal, score)

## TRAINING LOOP END ##


print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)

1
     Age Attrition     BusinessTravel  DailyRate              Department  \
1     34       Yes  Travel_Frequently       1111         Human Resources   
2     37       Yes      Travel_Rarely       1490                   Sales   
3     57       Yes  Travel_Frequently       1450  Research & Development   
4     52       Yes  Travel_Frequently       1018                   Sales   
6     43       Yes  Travel_Frequently        770  Research & Development   
..   ...       ...                ...        ...                     ...   
127   30       Yes      Travel_Rarely        222         Human Resources   
129   31       Yes      Travel_Rarely        121         Human Resources   
130   37       Yes      Travel_Rarely       1474                   Sales   
131   33       Yes  Travel_Frequently        263  Research & Development   
132   30       Yes      Travel_Rarely        125                   Sales   

     DistanceFromHome  Education    EducationField  EmployeeCount  \
1               

3
     Age Attrition     BusinessTravel  DailyRate              Department  \
0     44       Yes      Travel_Rarely        738                   Sales   
2     34       Yes      Travel_Rarely        805  Research & Development   
3     35       Yes      Travel_Rarely        423                   Sales   
4     45       Yes      Travel_Rarely        106                   Sales   
6     40       Yes      Travel_Rarely        573  Research & Development   
..   ...       ...                ...        ...                     ...   
173   62       Yes      Travel_Rarely       1247         Human Resources   
174   29       Yes      Travel_Rarely        194  Research & Development   
0     36       Yes      Travel_Rarely        173  Research & Development   
1     35       Yes      Travel_Rarely        917  Research & Development   
2     28       Yes  Travel_Frequently        315                   Sales   

     DistanceFromHome  Education EducationField  EmployeeCount  \
0                  

5
    Age Attrition     BusinessTravel  DailyRate              Department  \
0    21       Yes      Travel_Rarely        376  Research & Development   
1    44       Yes  Travel_Frequently       1463                   Sales   
2    34       Yes      Travel_Rarely        155  Research & Development   
3    37       Yes      Travel_Rarely        514                   Sales   
4    21       Yes         Non-Travel        113                   Sales   
..  ...       ...                ...        ...                     ...   
0    40       Yes      Travel_Rarely        190  Research & Development   
2    19       Yes      Travel_Rarely        253  Research & Development   
5    43       Yes  Travel_Frequently        114                   Sales   
0    40       Yes      Travel_Rarely        104  Research & Development   
1    36       Yes      Travel_Rarely        285                   Sales   

    DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0               

In [288]:
for feat in ord_feats.keys():
    print(feat,best_data[feat].unique())

StockOptionLevel [0 1 2 3]
EnvironmentSatisfaction [2 4 3 1]
JobInvolvement [4 2 3 1]
JobSatisfaction [4 3 2 1]
Education [1 4 2 3 5]
PerformanceRating [3 4]
RelationshipSatisfaction [4 2 3 1]
WorkLifeBalance [2 3 1 4]


In [289]:
best_data.to_csv("synth_data_with_constaint.csv", index=False)

So after 75 loops the overall score is 0.81 (this score is between 0 and 1) for 600 synthetic examples of Attrition = "Yes" so it's not too bad.

```
Best score obtained:  0.8127129012858366
Best parameters:  {'epochs': 208, 'batch_size': 42, 'embedding_dim': 10, 'gen': 54, 'dim_gen': 788}
```

# **Setting up data for round 2**
The real dataset can be split into 60:40 training/test. Once this has been split the synthetic data can be added to the training data. This in a way doesn't "waste" the real dataset in the training.

In [290]:
# Load synth data in
synth_data = pd.read_csv("synth_data_with_constaint.csv")

# Split real data into training + test set
train, test, target_train, target_test = train_test_split(hr_data.drop("Attrition", axis = 1), hr_data["Attrition"], test_size = 0.4, random_state = 42)

# Add Attrition column back into training + test set
train["Attrition"] = target_train
test["Attrition"] = target_test

# Add the 600 synthetic rows of data to training data + remove dataset column
input = pd.concat([train, synth_data])

In [291]:
print("Training size (real):", train.shape)
print("Training size (fake + real):", input.shape)
print("Testing size (real:", test.shape)

Training size (real): (882, 35)
Training size (fake + real): (1482, 35)
Testing size (real: (588, 35)


In [292]:
for feat in list(ord_feats.keys()):
  print(feat, input[feat].unique())

StockOptionLevel [0 1 2 3]
EnvironmentSatisfaction [2 3 1 4]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1]
Education [4 2 3 1 5]
PerformanceRating [3 4]
RelationshipSatisfaction [4 2 1 3]
WorkLifeBalance [2 3 4 1]


In [293]:
ord_feats={feat:[str(num) for num in input[feat].unique()] for feat in ord_feats.keys()}

Now this data can be fed into a new `setup` but this time declaring `test_data` as the test data that was created earlier. 

So there is a combination of **synthetic and real data in the training set** but only real in the test data.

In [294]:
# Run pycaret setup w/synthetic data
setup(input, 
      target = target, 
      test_data = test,
      fold_strategy = "stratifiedkfold",
      train_size = 0.7, 
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True, verbose = False)

(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 1041 -0.719050  -0.120026         -0.693821        2.0   
 184   1.743130   0.376232          0.236400        1.0   
 1222 -1.112998  -1.545061          1.282899        3.0   
 67    0.955233   0.956717         -0.461266        2.0   
 220   0.068848   1.086473         -0.693821        1.0   
 ...        ...        ...               ...        ...   
 1009  2.235566   0.310216         -1.158932        2.0   
 757  -0.128126  -1.599695         -1.158932        0.0   
 1361 -0.916024   0.403549         -0.577544        2.0   
 376   1.546156   0.590215          0.352678        1.0   
 314   0.364310  -1.825060         -0.112433        3.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 1041                      3.0    0.430706             0.0              3.0   
 184                       3.0   -0.825527             2.0    

Now setup the `compare_models` can be checked again. Interestingly the `AUC`, `Recall` and `Precision` all went up considerably. `Recall` nearly doubled!

In [295]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8897,0.9548,0.8446,0.9277,0.8584,0.7811,0.7994,0.088
lightgbm,Light Gradient Boosting Machine,0.8876,0.9528,0.8446,0.9319,0.8613,0.7767,0.7965,0.035
rf,Random Forest Classifier,0.8836,0.9444,0.8368,0.924,0.847,0.7692,0.7892,0.142
ada,Ada Boost Classifier,0.866,0.9441,0.8459,0.879,0.846,0.7332,0.7459,0.043
et,Extra Trees Classifier,0.8741,0.941,0.8551,0.8939,0.8542,0.7493,0.7662,0.132
lr,Logistic Regression,0.8329,0.9146,0.8166,0.8455,0.8238,0.6664,0.6732,0.018
lda,Linear Discriminant Analysis,0.8281,0.9112,0.8114,0.8429,0.8189,0.6569,0.6648,0.013
qda,Quadratic Discriminant Analysis,0.6899,0.8644,0.3988,0.8893,0.5196,0.3882,0.455,0.011
nb,Naive Bayes,0.6142,0.8587,0.2662,0.904,0.4007,0.2395,0.332,0.009
knn,K Neighbors Classifier,0.7513,0.8394,0.5945,0.8563,0.6781,0.5067,0.5299,0.037


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=7572, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

A gradient boosting classifier was decided upon.

In [296]:
gbc = create_model("gbc") # Create the gradient boosting classifier

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6242,0.7505,0.3026,0.8846,0.451,0.258,0.3445
1,0.6376,0.852,0.3289,0.8929,0.4808,0.2842,0.3684
2,0.9595,0.9932,0.9733,0.9481,0.9605,0.9189,0.9192
3,0.9662,0.9947,0.9867,0.9487,0.9673,0.9324,0.9332
4,0.9797,0.9985,0.9867,0.9737,0.9801,0.9594,0.9595
5,0.9595,0.9929,0.9867,0.9367,0.961,0.9188,0.9202
6,0.9189,0.9762,0.9605,0.8902,0.9241,0.8374,0.8401
7,0.9527,0.9969,0.9868,0.9259,0.9554,0.9052,0.9072
8,0.9392,0.9958,0.9868,0.9036,0.9434,0.878,0.882
9,0.9595,0.9973,0.9474,0.973,0.96,0.9189,0.9193


# **Generating synthetic data with tuning, and with constraint**
There's a useful library `sdev` [Synthetic Data Generation](https://github.com/sdv-dev/SDV) to generate synthetic tabular data. I've played around with it specifically the `CopulaGAN` model.

The general idea is to provide a `primary key` and to then run the model for so many `epochs`. There are hyperparameters to tune
* `epochs`
* `batch_size`
* `embedding_dim`
* `gen_dim`
* `dis_dim`
* `l2scale` (omitted in this experiment)



Rather than running a CopulaGAN without hyperparameters or guessing hyperparameters I've leveraged `BTB` [Bayesian Tuning and Bandits](https://github.com/MLBazaar/BTB) which is an optimisation library. To start I'll initialise the `GCPTuner` which is used to declare the hyperparameters to tune. Then the hyperparameters can be called using `propose` from BTB.

In [297]:
tuner = GCPTuner(Tunable({
          'epochs': hp.IntHyperParam(min = 80, max = 400),
          'batch_size' : hp.IntHyperParam(min = 1, max = 100),
          'embedding_dim' : hp.IntHyperParam(min = 1, max = 100),
          'gen' : hp.IntHyperParam(min = 1, max = 1000),
          'dim_gen' : hp.IntHyperParam(min = 1, max = 1000)
        }))

In [298]:
import sdv.constraints as cons
def MAX(a,b): return (abs(a-b)+(a+b))//2

def MIN(a,b): return (-abs(a-b)+(a+b))//2

def over_18(data): return MAX(data["Age"],18)

age_constraint = cons.ColumnFormula(column = 'Age',
                                    formula = over_18,
                                    handling_strategy = 'reject_sampling')

real = hr_data[hr_data["Attrition"] == "Yes"]

def ord_feats_valid(feat):
    def foo(data):
        max_entry = max(real_ord_feats[feat])
        min_entry = min(real_ord_feats[feat])
        return MIN(MAX(data[feat], min_entry), max_entry)
    return foo

ord_feats_funcs = {feat : lambda data : MIN(MAX(data[feat], int(min(real_ord_feats[feat]))), int(max(real_ord_feats[feat])))  for feat in ord_feats.keys()}

ord_feats_funcs2 = {feat : ord_feats_valid(feat) for feat in ord_feats.keys()}

#ord_feats_constraints = [cons.ColumnFormula(column = feat,
#                                            formula = ord_feats_funcs[feat],
#                                            handling_strategy = 'reject_sampling') for feat in ord_feats.keys()]

ord_feats_constraints = [cons.ColumnFormula(column = feat,
                                            formula = ord_feats_funcs2[feat],
                                            handling_strategy = 'reject_sampling') for feat in ord_feats.keys()]


In [299]:
best_score = 0 # Keep track of best score
tracker = 0 # Keep track of how many loops have completed
best_data = None

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
for _ in range(5):

  # Increment the tracker
    tracker += 1

  # Every 5 loops output the tracker
    if tracker % 1 == 0:
        print(tracker)

  # Get the hyperparameters for this loop
    proposal = tuner.propose(1)
  
  # Create the CopulaGAN
  # NOTE - batch_size is multiplied by 10 as needs to be a factor of 10
    model = CopulaGAN(primary_key = "EmployeeNumber", 
                    embedding_dim = proposal['embedding_dim'],
                    generator_dim = (proposal['gen'], proposal['gen']),
                    discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
                    batch_size = proposal['batch_size'] * 10,
                    epochs = proposal['epochs'],
                    constraints = ord_feats_constraints+[age_constraint])
    #model = CopulaGAN(constraints = ord_feats_constraints+[age_constraint])
    #model = CopulaGAN()
  
  # Fit the CopulaGAN
    model.fit(real)
  
  # Create 600 rows of data
    synth_data = model.sample(600, max_retries = 300)
    #print(synth_data)
  
  # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

  # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score
        best_data = synth_data

  # Record the hyperparameters and score      
    tuner.record(proposal, score)

## TRAINING LOOP END ##


print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)

1
2
3
4
5
Best score obtained:  0.5256269051129521
Best parameters:  {'epochs': 291, 'batch_size': 60, 'embedding_dim': 1, 'gen': 17, 'dim_gen': 523}


In [300]:
best_data.to_csv("synth_data_with_constaint_with_tuner.csv", index = False)

So after 75 loops the overall score is 0.81 (this score is between 0 and 1) for 600 synthetic examples of Attrition = "Yes" so it's not too bad.

```
Best score obtained:  0.8127129012858366
Best parameters:  {'epochs': 208, 'batch_size': 42, 'embedding_dim': 10, 'gen': 54, 'dim_gen': 788}
```

# **Setting up data for round 2**
The real dataset can be split into 60:40 training/test. Once this has been split the synthetic data can be added to the training data. This in a way doesn't "waste" the real dataset in the training.

In [301]:
# Load synth data in
synth_data = pd.read_csv("synth_data_with_constaint_with_tuner.csv")

# Split real data into training + test set
train, test, target_train, target_test = train_test_split(hr_data.drop("Attrition", axis = 1), hr_data["Attrition"], test_size = 0.4, random_state = 42)

# Add Attrition column back into training + test set
train["Attrition"] = target_train
test["Attrition"] = target_test

# Add the 600 synthetic rows of data to training data + remove dataset column
input = pd.concat([train, synth_data])

In [302]:
print("Training size (real):", train.shape)
print("Training size (fake + real):", input.shape)
print("Testing size (real:", test.shape)

Training size (real): (882, 35)
Training size (fake + real): (1482, 35)
Testing size (real: (588, 35)


In [303]:
for feat in list(ord_feats.keys()):
  print(feat, input[feat].unique())

StockOptionLevel [0 1 2 3]
EnvironmentSatisfaction [2 3 1 4]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1]
Education [4 2 3 1 5]
PerformanceRating [3 4]
RelationshipSatisfaction [4 2 1 3]
WorkLifeBalance [2 3 4 1]


In [304]:
ord_feats={feat:[str(num) for num in input[feat].unique()] for feat in ord_feats.keys()}

Now this data can be fed into a new `setup` but this time declaring `test_data` as the test data that was created earlier. 

So there is a combination of **synthetic and real data in the training set** but only real in the test data.

In [305]:
# Run pycaret setup w/synthetic data
setup(input, 
      target = target, 
      test_data = test,
      fold_strategy = "stratifiedkfold",
      train_size = 0.7, 
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True, verbose = False)

(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 1041 -0.896153   0.195767         -0.601101        2.0   
 184   1.972806   0.732377          0.353958        1.0   
 1222 -1.355186  -1.345142          1.428401        3.0   
 67    1.054739   1.360064         -0.362336        2.0   
 220   0.021914   1.500370         -0.601101        1.0   
 ...        ...        ...               ...        ...   
 1009  2.546597   0.660993         -1.078631        2.0   
 757  -0.207603  -1.404218         -1.078631        0.0   
 1361 -1.125669   0.761915         -0.481719        2.0   
 376   1.743289   0.963760          0.473341        1.0   
 314   0.366189  -1.647908         -0.004189        3.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 1041                      3.0    0.825373             0.0              3.0   
 184                       3.0   -0.481364             2.0    

Now setup the `compare_models` can be checked again. Interestingly the `AUC`, `Recall` and `Precision` all went up considerably. `Recall` nearly doubled!

In [306]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8795,0.9529,0.8669,0.8919,0.865,0.7599,0.7736,0.15
gbc,Gradient Boosting Classifier,0.8903,0.9485,0.8669,0.9142,0.8742,0.7815,0.7964,0.103
lightgbm,Light Gradient Boosting Machine,0.8815,0.9451,0.8656,0.8971,0.8672,0.7639,0.7771,0.045
et,Extra Trees Classifier,0.8693,0.9383,0.8589,0.8798,0.8573,0.7394,0.7515,0.142
ada,Ada Boost Classifier,0.8429,0.9135,0.8416,0.8497,0.8401,0.6861,0.6926,0.057
qda,Quadratic Discriminant Analysis,0.7723,0.8566,0.5934,0.874,0.678,0.5493,0.5698,0.017
knn,K Neighbors Classifier,0.7661,0.8467,0.6409,0.8637,0.7147,0.5354,0.5588,0.322
lda,Linear Discriminant Analysis,0.7564,0.8449,0.7553,0.7666,0.7596,0.5126,0.5145,0.013
lr,Logistic Regression,0.7551,0.8438,0.762,0.7606,0.7601,0.5098,0.5115,0.799
nb,Naive Bayes,0.6114,0.7764,0.3003,0.8304,0.4371,0.2325,0.3033,0.015


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5415, verbose=0,
                       warm_start=False)

A gradient boosting classifier was decided upon.

In [307]:
gbc = create_model("gbc") # Create the gradient boosting classifier

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.698,0.7471,0.4737,0.878,0.6154,0.4014,0.4535
1,0.698,0.8531,0.4342,0.9429,0.5946,0.4024,0.4797
2,0.9459,0.9801,0.9733,0.9241,0.9481,0.8918,0.8931
3,0.9189,0.9759,0.9333,0.9091,0.9211,0.8377,0.8381
4,0.9392,0.9918,0.9467,0.9342,0.9404,0.8783,0.8784
5,0.9527,0.9898,0.9733,0.9359,0.9542,0.9053,0.9061
6,0.9257,0.9823,0.9868,0.8824,0.9317,0.8508,0.8572
7,0.9257,0.9843,0.9868,0.8824,0.9317,0.8508,0.8572
8,0.9257,0.9874,0.9737,0.8916,0.9308,0.8509,0.8547
9,0.973,0.9931,0.9868,0.9615,0.974,0.9459,0.9462


# **Generating synthetic data without tuning, and without constraint**
There's a useful library `sdev` [Synthetic Data Generation](https://github.com/sdv-dev/SDV) to generate synthetic tabular data. I've played around with it specifically the `CopulaGAN` model.

The general idea is to provide a `primary key` and to then run the model for so many `epochs`. There are hyperparameters to tune
* `epochs`
* `batch_size`
* `embedding_dim`
* `gen_dim`
* `dis_dim`
* `l2scale` (omitted in this experiment)



Rather than running a CopulaGAN without hyperparameters or guessing hyperparameters I've leveraged `BTB` [Bayesian Tuning and Bandits](https://github.com/MLBazaar/BTB) which is an optimisation library. To start I'll initialise the `GCPTuner` which is used to declare the hyperparameters to tune. Then the hyperparameters can be called using `propose` from BTB.

In [310]:
best_score = 0 # Keep track of best score
tracker = 0 # Keep track of how many loops have completed
best_data = None

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
for _ in range(5):

  # Increment the tracker
    tracker += 1

  # Every 5 loops output the tracker
    if tracker % 1 == 0:
        print(tracker)

  # Get the hyperparameters for this loop
    #proposal = tuner.propose(1)
  
  # Create the CopulaGAN
  # NOTE - batch_size is multiplied by 10 as needs to be a factor of 10
    #model = CopulaGAN(primary_key = "EmployeeNumber", 
    #                embedding_dim = proposal['embedding_dim'],
    #                generator_dim = (proposal['gen'], proposal['gen']),
    #                discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
    #                batch_size = proposal['batch_size'] * 10,
    #                epochs = proposal['epochs'],
    #                constraints = ord_feats_constraints+[age_constraint])
    #model = CopulaGAN(constraints = ord_feats_constraints+[age_constraint])
    model = CopulaGAN()
  
  # Fit the CopulaGAN
    model.fit(real)
  
  # Create 600 rows of data
    synth_data = model.sample(600, max_retries = 300)
    #print(synth_data)
  
  # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

  # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score
        best_data = synth_data

  # Record the hyperparameters and score      
    #tuner.record(proposal, score)

## TRAINING LOOP END ##


print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)

1
2
3
4
5
Best score obtained:  0.5046653249677896
Best parameters:  {'epochs': 372, 'batch_size': 65, 'embedding_dim': 75, 'gen': 70, 'dim_gen': 208}


In [311]:
best_data.to_csv("synth_data_without_constaint_without_tuner.csv", index = False)

So after 75 loops the overall score is 0.81 (this score is between 0 and 1) for 600 synthetic examples of Attrition = "Yes" so it's not too bad.

```
Best score obtained:  0.8127129012858366
Best parameters:  {'epochs': 208, 'batch_size': 42, 'embedding_dim': 10, 'gen': 54, 'dim_gen': 788}
```

# **Setting up data for round 2**
The real dataset can be split into 60:40 training/test. Once this has been split the synthetic data can be added to the training data. This in a way doesn't "waste" the real dataset in the training.

In [312]:
# Load synth data in
synth_data = pd.read_csv("synth_data_with_constaint_with_tuner.csv")

# Split real data into training + test set
train, test, target_train, target_test = train_test_split(hr_data.drop("Attrition", axis = 1), hr_data["Attrition"], test_size = 0.4, random_state = 42)

# Add Attrition column back into training + test set
train["Attrition"] = target_train
test["Attrition"] = target_test

# Add the 600 synthetic rows of data to training data + remove dataset column
input = pd.concat([train, synth_data])

In [313]:
print("Training size (real):", train.shape)
print("Training size (fake + real):", input.shape)
print("Testing size (real:", test.shape)

Training size (real): (882, 35)
Training size (fake + real): (1482, 35)
Testing size (real: (588, 35)


In [314]:
for feat in list(ord_feats.keys()):
  print(feat, input[feat].unique())

StockOptionLevel [0 1 2 3]
EnvironmentSatisfaction [2 3 1 4]
JobInvolvement [3 2 4 1]
JobSatisfaction [4 2 3 1]
Education [4 2 3 1 5]
PerformanceRating [3 4]
RelationshipSatisfaction [4 2 1 3]
WorkLifeBalance [2 3 4 1]


In [315]:
ord_feats={feat:[str(num) for num in input[feat].unique()] for feat in ord_feats.keys()}

Now this data can be fed into a new `setup` but this time declaring `test_data` as the test data that was created earlier. 

So there is a combination of **synthetic and real data in the training set** but only real in the test data.

In [316]:
# Run pycaret setup w/synthetic data
setup(input, 
      target = target, 
      test_data = test,
      fold_strategy = "stratifiedkfold",
      train_size = 0.7, 
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      data_split_stratify = True,
      ignore_low_variance = True,
      silent = True, verbose = False)

(StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
 [],
            Age  DailyRate  DistanceFromHome  Education  \
 1041 -0.896153   0.195767         -0.601101        2.0   
 184   1.972806   0.732377          0.353958        1.0   
 1222 -1.355186  -1.345142          1.428401        3.0   
 67    1.054739   1.360064         -0.362336        2.0   
 220   0.021914   1.500370         -0.601101        1.0   
 ...        ...        ...               ...        ...   
 1009  2.546597   0.660993         -1.078631        2.0   
 757  -0.207603  -1.404218         -1.078631        0.0   
 1361 -1.125669   0.761915         -0.481719        2.0   
 376   1.743289   0.963760          0.473341        1.0   
 314   0.366189  -1.647908         -0.004189        3.0   
 
       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobSatisfaction  \
 1041                      3.0    0.825373             0.0              3.0   
 184                       3.0   -0.481364             2.0    

Now setup the `compare_models` can be checked again. Interestingly the `AUC`, `Recall` and `Precision` all went up considerably. `Recall` nearly doubled!

In [317]:
compare_models(sort = "AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8916,0.9524,0.8828,0.9029,0.8785,0.784,0.7986,0.155
gbc,Gradient Boosting Classifier,0.8896,0.9484,0.8669,0.913,0.8736,0.7802,0.7951,0.095
lightgbm,Light Gradient Boosting Machine,0.8815,0.9451,0.8656,0.8971,0.8672,0.7639,0.7771,0.053
et,Extra Trees Classifier,0.8707,0.9387,0.8523,0.8883,0.8586,0.7421,0.7533,0.139
ada,Ada Boost Classifier,0.8429,0.9135,0.8416,0.8497,0.8401,0.6861,0.6926,0.048
qda,Quadratic Discriminant Analysis,0.7723,0.8566,0.5934,0.874,0.678,0.5493,0.5698,0.014
knn,K Neighbors Classifier,0.7661,0.8467,0.6409,0.8637,0.7147,0.5354,0.5588,0.33
lda,Linear Discriminant Analysis,0.7564,0.8449,0.7553,0.7666,0.7596,0.5126,0.5145,0.013
lr,Logistic Regression,0.7551,0.8438,0.762,0.7606,0.7601,0.5098,0.5115,0.851
nb,Naive Bayes,0.6114,0.7764,0.3003,0.8304,0.4371,0.2325,0.3033,0.015


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=3643, verbose=0,
                       warm_start=False)

A gradient boosting classifier was decided upon.

In [318]:
gbc = create_model("gbc") # Create the gradient boosting classifier

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.698,0.7471,0.4737,0.878,0.6154,0.4014,0.4535
1,0.698,0.8524,0.4342,0.9429,0.5946,0.4024,0.4797
2,0.9392,0.9801,0.9733,0.9125,0.9419,0.8782,0.8803
3,0.9189,0.9759,0.9333,0.9091,0.9211,0.8377,0.8381
4,0.9392,0.9918,0.9467,0.9342,0.9404,0.8783,0.8784
5,0.9527,0.9898,0.9733,0.9359,0.9542,0.9053,0.9061
6,0.9257,0.9823,0.9868,0.8824,0.9317,0.8508,0.8572
7,0.9257,0.9843,0.9868,0.8824,0.9317,0.8508,0.8572
8,0.9257,0.987,0.9737,0.8916,0.9308,0.8509,0.8547
9,0.973,0.9931,0.9868,0.9615,0.974,0.9459,0.9462


# **Finalising the model**
Now with the model trained and evaluated it needs to be **finalised**. This encorporates the test data into the model that will be used on unseen data.

In [319]:
final_gbc = finalize_model(gbc)

**NOTE**: I haven't further explored the model e.g. feature importance as the purpose of this project was to focus on synthetic data generation and a streamlit app.

# **Saving and exporting ready for app build**
To use the model in a `streamlit` app certain things need to be exported
* The model
* Preprocessed data
* Preparation steps (`prep pipe`)

In [None]:
# Model (save)
save_model(final_gbc, "final_gbc")

# Preparation steps (get + save)
prep = get_config("prep_pipe")
joblib.dump(prep, 'prep_pipe.pkl', compress = 1)

# Preprocessed data (get + save)
get_config("X").to_csv("preprocssed_data.csv", index = False)

# **Streamlit app**
The final output of this was a `streamlit` app to allow single predictions w/reason plots or multi predictions from an uploaded csv (see `README.md` and `app.py`).

# **Conclusion**
I think that synthetic data generation can be a powerful tool especially in situations when further data collection cannot be performed in the near term. Fine tuning hyperparameters is great to have but more data and feature engineering are still superior.