In [12]:
# Colab was used for this notebook hence the library installs
%%capture
!pip install pycaret --quiet --upgrade
!pip install dabl --quiet --upgrade
!pip install shap --quiet --upgrade
!pip install sdv  --quiet --upgrade
!pip install sdv[ctgan] --quiet --upgrade
!pip install baytune --quiet --upgrade
!pip install optuna --quiet --upgrade

UsageError: Line magic function `%%capture` not found.


# Reproducing the 1-D array bug

In [2]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import joblib # Saving preparation steps
import numpy as np # Random test data (np is used below but was never imported explicitly)
import pandas as pd # Basic data manipulation
#import dabl as db # Summary plot
from btb.tuning import Tunable, GCPTuner # CopulaGAN optimising
from btb.tuning import hyperparams as hp  # Set hyperparameters for optimising
from sdv.evaluation import evaluate # Evaluate synthetic data
from sdv.tabular import CopulaGAN # Synthetic data
from sklearn.model_selection import train_test_split # Data split

# *First, we use fake data to reproduce the bug.*

In [3]:
# Build the fake dataset: three independent columns, each holding 1000
# uniform draws scaled to the range [0, 50).
# A fixed seed on a local generator makes the data identical on every
# Restart & Run All, without touching NumPy's global random state.
SEED = 42
rng = np.random.default_rng(SEED)
x = rng.random(1000) * 50
y = rng.random(1000) * 50
z = rng.random(1000) * 50
# Build the frame directly from named columns instead of round-tripping
# through an intermediate DataFrame and a transpose.
df = pd.DataFrame({"x": x, "y": y, "z": z})

In [4]:
# Integer search ranges for the hyperparameters that the tuning loop
# later passes on to CopulaGAN.
search_space = {
    'epochs': hp.IntHyperParam(min=80, max=400),
    'batch_size': hp.IntHyperParam(min=1, max=100),
    'embedding_dim': hp.IntHyperParam(min=1, max=100),
    'gen': hp.IntHyperParam(min=1, max=1000),
    'dim_gen': hp.IntHyperParam(min=1, max=1000),
}

# Wrap the ranges in a Tunable and hand it to the GCP tuner.
tunable = Tunable(search_space)
tuner = GCPTuner(tunable)

In [None]:
# Tune CopulaGAN on the fake data for 50 rounds:
# propose -> build model -> fit -> sample -> score -> record.
real = df
for tracker in range(1, 51):
    # 1-based progress counter
    print(tracker)

    # Ask the tuner for a single set of hyperparameters
    proposal = tuner.propose(1)
    gen_width = proposal['gen']
    disc_width = proposal['dim_gen']

    # batch_size is scaled by 10 before being passed to the model
    model = CopulaGAN(
        embedding_dim=proposal['embedding_dim'],
        generator_dim=(gen_width, gen_width),
        discriminator_dim=(disc_width, disc_width),
        batch_size=proposal['batch_size'] * 10,
        epochs=proposal['epochs'],
    )
    model.fit(real)

    # Draw 600 synthetic rows and score them against the real data
    synth_data = model.sample(600, max_retries=300)
    score = evaluate(synthetic_data=synth_data, real_data=real)

    # Feed the result back to the tuner
    tuner.record(proposal, score)

1
2


# *Now we present a real example.*

In [None]:
# Load the HR attrition CSV, then preview its first 5 rows
hr_data = pd.read_csv(filepath_or_buffer="data/HR Employee Attrition.csv")
hr_data.head(n=5)

Run the cells below several times and the same error will appear.

In [None]:
# Fresh tuner for the real-data run: integer search ranges for the
# hyperparameters that the training loop passes on to CopulaGAN.
search_space = {
    'epochs': hp.IntHyperParam(min=80, max=400),
    'batch_size': hp.IntHyperParam(min=1, max=100),
    'embedding_dim': hp.IntHyperParam(min=1, max=100),
    'gen': hp.IntHyperParam(min=1, max=1000),
    'dim_gen': hp.IntHyperParam(min=1, max=1000),
}

# Wrap the ranges in a Tunable and hand it to the GCP tuner.
tunable = Tunable(search_space)
tuner = GCPTuner(tunable)

In [None]:
best_score = 0 # Keep track of best score
# Best hyperparameters seen so far. Initialised up front so the final report
# cannot raise a NameError when no score ever beats the starting best_score.
best_params = None
PRINT_EVERY = 1 # Output the loop counter every PRINT_EVERY loops (1 = every loop)

real = hr_data[hr_data["Attrition"] == "Yes"] # Filter to only those employees that left

## TRAINING LOOP START ##
# tracker counts how many loops have started (1-based)
for tracker in range(1, 51):

    # Output the tracker at the configured interval
    if tracker % PRINT_EVERY == 0:
        print(tracker)

    # Get the hyperparameters for this loop
    proposal = tuner.propose(1)

    # Create the CopulaGAN
    # NOTE - batch_size is multiplied by 10 as it needs to be a multiple of 10
    model = CopulaGAN(primary_key = "EmployeeNumber",
                    embedding_dim = proposal['embedding_dim'],
                    generator_dim = (proposal['gen'], proposal['gen']),
                    discriminator_dim = (proposal['dim_gen'], proposal['dim_gen']),
                    batch_size = proposal['batch_size'] * 10,
                    epochs = proposal['epochs'])

    # Fit the CopulaGAN
    model.fit(real)

    # Create 600 rows of data
    synth_data = model.sample(600, max_retries = 300)

    # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data = synth_data, real_data = real)

    # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score

    # Record the hyperparameters and score
    tuner.record(proposal, score)

## TRAINING LOOP END ##


# best_params is None here if no loop produced a score above the initial best_score
print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)