In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

data_folder = Path("../../data")
dataset_location = data_folder / "hcvdat0.csv"

# Necessary for notebook to see src
import sys
sys.path.append('../..')

from src.visualization import plot_corelation_matrix, plot_continuous_columns

Device: cuda


# Load Dataset


In [2]:
original_dataset = pd.read_csv(dataset_location)
dataset = original_dataset.copy(deep=True)

dataset.drop(columns=['Unnamed: 0'], inplace=True)

In [3]:
dataset.head(10)

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
5,0=Blood Donor,32,m,41.6,43.3,18.5,19.7,12.3,9.92,6.05,111.0,91.0,74.0
6,0=Blood Donor,32,m,46.3,41.3,17.5,17.8,8.5,7.01,4.79,70.0,16.9,74.5
7,0=Blood Donor,32,m,42.2,41.9,35.8,31.1,16.1,5.82,4.6,109.0,21.5,67.1
8,0=Blood Donor,32,m,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83.0,13.7,71.3
9,0=Blood Donor,32,m,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81.0,15.9,69.9


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  615 non-null    object 
 1   Age       615 non-null    int64  
 2   Sex       615 non-null    object 
 3   ALB       614 non-null    float64
 4   ALP       597 non-null    float64
 5   ALT       614 non-null    float64
 6   AST       615 non-null    float64
 7   BIL       615 non-null    float64
 8   CHE       615 non-null    float64
 9   CHOL      605 non-null    float64
 10  CREA      615 non-null    float64
 11  GGT       615 non-null    float64
 12  PROT      614 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 62.6+ KB


# Label Encoding Categorical Values

In [5]:
from sklearn.preprocessing import OrdinalEncoder

discrete_columns = ['Category', 'Sex']

dataset['Category'] = dataset['Category'].str[0].astype(int)

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

dataset['Sex'] = encoder.fit_transform(dataset[['Sex']])


# Detect Metadata

In [6]:
from src.evaluation.synthesizer_evaluation import SynthesizerEvaluation
from sdv.metadata.single_table import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(dataset)

# Check The Metadata

In [7]:
types = pd.DataFrame(
    data={
        'Dataset': ['categorical' if col in  discrete_columns else 'numerical' for col in dataset.columns],
        'metadata': [metadata.columns[key]['sdtype'] for key in metadata.columns]
    }   
)

types

Unnamed: 0,Dataset,metadata
0,categorical,categorical
1,numerical,numerical
2,categorical,categorical
3,numerical,numerical
4,numerical,numerical
5,numerical,numerical
6,numerical,numerical
7,numerical,numerical
8,numerical,numerical
9,numerical,numerical


In [8]:
dataset.head(10)

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,32,1.0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0,32,1.0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0,32,1.0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0,32,1.0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0,32,1.0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
5,0,32,1.0,41.6,43.3,18.5,19.7,12.3,9.92,6.05,111.0,91.0,74.0
6,0,32,1.0,46.3,41.3,17.5,17.8,8.5,7.01,4.79,70.0,16.9,74.5
7,0,32,1.0,42.2,41.9,35.8,31.1,16.1,5.82,4.6,109.0,21.5,67.1
8,0,32,1.0,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83.0,13.7,71.3
9,0,32,1.0,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81.0,15.9,69.9


# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dataset.iloc[:, 1:],
    dataset.iloc[:, 0],
    random_state=42,
    shuffle=True,
    stratify=dataset.iloc[:, 0],
    test_size=0.25
)


# Import Relevant Libraries

In [10]:
from src.metrics.js_divergence import JSDivergence                  # Discrete Columns
from src.metrics.wasserstein_distance import WassersteinDistance    # Continuous Columns
from src.metrics.propensity_score import PropensityScore            # All Columns
from src.metrics.pcorr_difference import PairwiseCorrelationDifference
import optuna

  from .autonotebook import tqdm as notebook_tqdm


# Objective

In [None]:
from sdv.single_table import CTGANSynthesizer
from src.synthesizers import save_model
from src.schemas.trial_info import TrialInfo, SynthesizerHyperParameters


def objective(trial: optuna.Trial):
    discriminator_steps = trial.suggest_int('discriminator_steps', 1, 5)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-7, 5e-4, log=True)
    epochs = trial.suggest_int('epochs', 300, 2000, 10)
    
    
    
    synthesizer = CTGANSynthesizer(
        metadata, 
        generator_lr=lr, 
        discriminator_steps=discriminator_steps,
        generator_decay=weight_decay,
        discriminator_decay=weight_decay,
        epochs=epochs, 
        discriminator_lr=lr,
        verbose=False,
    )

    synthesizer.fit(dataset)

    synthetic_data = synthesizer.sample(len(dataset))

    js_metric = JSDivergence.compute(dataset, synthetic_data, metadata)
    wd_metric = WassersteinDistance.compute(dataset, synthetic_data, metadata)
    propensity_score = PropensityScore.compute(dataset, synthetic_data, metadata)
    pcorr_difference = PairwiseCorrelationDifference.compute(dataset, synthetic_data, metadata)

    file_path = save_model(synthesizer, 'CTGAN', 'hcvdata', folder_path='../../models')

    trial.set_user_attr('file_path', file_path)
 
    return js_metric, wd_metric, propensity_score, pcorr_difference




# STUDY

In [12]:
study = optuna.study.create_study(
    directions=['minimize', 'minimize', 'minimize', 'minimize']
)
study.set_metric_names(['JSD','WD','PS','PCD'])
study.optimize(objective, n_trials=150, n_jobs=1)

[I 2024-12-03 02:07:14,994] A new study created in memory with name: no-name-dbfb7bae-1fc5-4a59-8c20-28ad027f2e98
  study.set_metric_names(['JSD','WD','PS','PCD'])
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[I 2024-12-03 02:08:55,820] Trial 0 finished with values: {'JSD': 0.1096745079350184, 'WD': 0.0650259808262147, 'PS': 0.025520463175133637, 'PCD': 1.967028820382782} and parameters: {'discriminator_steps': 1, 'lr': 0.00800593064551969, 'weight_decay': 1.304591073293801e-05, 'epochs': 1050}.
[I 2024-12-03 02:09:42,016] Trial 1 finished with values: {'JSD': 0.13533543955546817, 'WD': 0.0347727772864245, 'PS': 0.09376604437950838, 'PCD': 1.4820831958979375} and parameters: {'discriminator_steps': 3, 'lr': 0.0007770294667215705, 'weight_decay': 3.2877492172719116e-06, 'epochs': 1170}.
[I 2024-12-03 02:10:29,387] Trial 2 finished with values: {'JSD': 0.11162517942005167, 'WD': 0.03396262977794898, 'PS': 0.02568517981437939, 'PC

In [13]:
trials_made = study.trials_dataframe(attrs=('params', 'value', 'user_attrs'), multi_index=True)
trials_made.to_csv('trial_info.csv', index=False)

In [15]:
study.best_trials

[FrozenTrial(number=5, state=1, values=[0.10317850653697606, 0.022684943193267863, 0.026460541289004543, 1.7424590452220508], datetime_start=datetime.datetime(2024, 12, 3, 2, 11, 35, 122592), datetime_complete=datetime.datetime(2024, 12, 3, 2, 12, 7, 545097), params={'discriminator_steps': 2, 'lr': 0.0017337396183780076, 'weight_decay': 2.292594775662668e-07, 'epochs': 1180}, user_attrs={'file_path': '../../models/hcvdata/CTGAN/40d7a14633877a990dca5f0a2f3962caa84d81bac749c3dde3d19b19cde57d43.pth'}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'discriminator_steps': IntDistribution(high=5, log=False, low=1, step=1), 'lr': FloatDistribution(high=0.01, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.0005, log=True, low=1e-07, step=None), 'epochs': IntDistribution(high=2000, log=False, low=300, step=10)}, trial_id=5, value=None),
 FrozenTrial(number=8, state=1, values=[0.09404759917211217, 0.038961192331092485, 0.01759788481323462, 

In [20]:
display(trials_made.iloc[trials_made['values']['JSD'].argmin()])
file_path = trials_made.iloc[trials_made['values']['JSD'].argmin()]['user_attrs']['file_path']

params      discriminator_steps                                                    2
            epochs                                                               430
            lr                                                              0.000019
            weight_decay                                                         0.0
values      JSD                                                             0.087466
            PCD                                                             2.143825
            PS                                                              0.029104
            WD                                                              0.028326
user_attrs  file_path              ../../models/hcvdata/CTGAN/ec249e28ddb29ed8094...
Name: 140, dtype: object

In [21]:
import torch

best_synhtesizer: CTGANSynthesizer = torch.load(file_path)
best_synhtesizer.get_loss_values_plot()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
display(trials_made.iloc[146])
second_file_path = trials_made.iloc[146]['user_attrs']['file_path']

second_best_synhtesizer: CTGANSynthesizer = torch.load(second_file_path)
second_best_synhtesizer.get_loss_values_plot()

params      discriminator_steps                                                    5
            epochs                                                              1670
            lr                                                              0.003969
            weight_decay                                                    0.000003
values      JSD                                                             0.105602
            PCD                                                              1.56544
            PS                                                              0.021072
            WD                                                              0.042669
user_attrs  file_path              ../../models/hcvdata/CTGAN/28640285d1c104117d9...
Name: 146, dtype: object

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Original Dataset

In [36]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

original_model = XGBClassifier(device=device, verbosity=2)

original_model.fit(X_train, y_train)
prediction = original_model.predict(X_test)


print(f'F1: {f1_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Precision: {precision_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Recall: {recall_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')

print(f'\n Full Report:\n {classification_report(y_true=y_test, y_pred=prediction)}')

F1:     0.6995
Precision:     0.7297
Recall:     0.6813

 Full Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       135
           1       0.60      0.50      0.55         6
           2       0.50      0.60      0.55         5
           3       0.83      0.62      0.71         8

    accuracy                           0.95       154
   macro avg       0.73      0.68      0.70       154
weighted avg       0.95      0.95      0.95       154



# Synhtetic Dataset

In [38]:
synhtetic_data = best_synhtesizer.sample(1000)
X_synhtetic = synhtetic_data.iloc[:, 1:]
y_synhtetic = synhtetic_data.iloc[:, 0]

synhtetic_model = XGBClassifier(device=device, verbosity=2, max_depth=20)

synhtetic_model.fit(X_synhtetic, y_synhtetic)
prediction = synhtetic_model.predict(X_test)


print(f'F1: {f1_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Precision: {precision_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Recall: {recall_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')

print(f'\n Full Report:\n {classification_report(y_true=y_test, y_pred=prediction)}')

F1:     0.2289
Precision:     0.2181
Recall:     0.2407

 Full Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.92       135
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         8

    accuracy                           0.84       154
   macro avg       0.22      0.24      0.23       154
weighted avg       0.76      0.84      0.80       154



In [40]:
y_synhtetic.value_counts()

Category
0    669
3    127
1    103
2    101
Name: count, dtype: int64

In [35]:
y_train.value_counts()

Category
0    405
3     22
1     18
2     16
Name: count, dtype: int64

In [42]:
from sdv.sampling import Condition

synhtetic_samples = best_synhtesizer.sample_from_conditions(
    conditions=[
        Condition(column_values={'Category': 1}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 1]))),
        Condition(column_values={'Category': 2}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 2]))),
        Condition(column_values={'Category': 3}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 3]))),
    ]
)

Sampling conditions: 100%|██████████| 1159/1159 [00:00<00:00, 1277.16it/s]


In [44]:
synhtetic_samples.Category.value_counts()

Category
2    389
1    387
3    383
Name: count, dtype: int64

In [51]:
original_data = pd.concat([y_train, X_train], axis=1)
augmented_data = pd.concat([original_data, synhtetic_samples])

display(augmented_data.head(10))
augmented_data.Category.value_counts()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
429,0,48,0.0,45.3,40.6,18.5,27.7,5.7,7.48,4.64,66.0,19.6,70.6
578,2,38,0.0,40.0,39.8,14.9,68.9,11.0,8.55,4.31,60.5,40.1,76.5
324,0,33,0.0,44.3,74.0,49.7,52.3,8.5,6.49,3.34,73.0,44.7,73.8
42,0,35,1.0,27.8,99.0,30.7,27.8,9.4,6.8,4.27,65.0,40.5,80.7
100,0,41,1.0,45.1,86.8,24.0,23.9,10.6,7.84,6.23,72.0,35.3,75.1
107,0,41,1.0,37.4,75.1,28.0,25.7,4.1,10.62,5.57,83.0,18.6,71.9
511,0,59,0.0,43.0,82.4,33.1,30.0,7.5,9.2,7.43,61.0,30.7,75.4
202,0,50,1.0,43.8,56.9,29.5,32.8,4.1,7.77,5.8,72.0,53.6,72.5
465,0,52,0.0,43.8,52.0,15.5,23.9,6.0,7.93,5.41,69.0,11.9,72.4
210,0,51,1.0,42.0,84.3,14.7,19.2,3.2,8.19,4.68,81.0,20.9,77.1


Category
0    405
2    405
3    405
1    405
Name: count, dtype: int64

# Augmented Dataset

In [None]:
X_augmented = augmented_data.iloc[:, 1:]
y_augmented = augmented_data.iloc[:, 0]

synhtetic_model = XGBClassifier(device=device, verbosity=2, max_depth=20)

synhtetic_model.fit(X_augmented, y_augmented)
prediction = synhtetic_model.predict(X_test)

print(f'F1: {f1_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Precision: {precision_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Recall: {recall_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')

print(f'\n Full Report:\n {classification_report(y_true=y_test, y_pred=prediction)}')

F1:     0.5689
Precision:     0.5386
Recall:     0.6572

 Full Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95       135
           1       0.50      0.50      0.50         6
           2       0.20      0.60      0.30         5
           3       0.45      0.62      0.53         8

    accuracy                           0.86       154
   macro avg       0.54      0.66      0.57       154
weighted avg       0.93      0.86      0.89       154



# Alternative Synhtesizer

In [53]:
from sdv.sampling import Condition

alternative_synhtetic_samples = second_best_synhtesizer.sample_from_conditions(
    conditions=[
        Condition(column_values={'Category': 1}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 1]))),
        Condition(column_values={'Category': 2}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 2]))),
        Condition(column_values={'Category': 3}, num_rows=(len(y_train[y_train == 0]) - len(y_train[y_train == 3]))),
    ]
)

alternative_augmented_data = pd.concat([original_data, alternative_synhtetic_samples])

X_augmented = alternative_augmented_data.iloc[:, 1:]
y_augmented = alternative_augmented_data.iloc[:, 0]

synhtetic_model = XGBClassifier(device=device, verbosity=2, max_depth=20)

synhtetic_model.fit(X_augmented, y_augmented)
prediction = synhtetic_model.predict(X_test)

print(f'F1: {f1_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Precision: {precision_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')
print(f'Recall: {recall_score(y_true=y_test, y_pred=prediction, average='macro'):>10.4f}')

print(f'\n Full Report:\n {classification_report(y_true=y_test, y_pred=prediction)}')

Sampling conditions: 100%|██████████| 1159/1159 [00:00<00:00, 1406.77it/s]


F1:     0.6950
Precision:     0.6417
Recall:     0.7894

 Full Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97       135
           1       0.50      0.67      0.57         6
           2       0.40      0.80      0.53         5
           3       0.67      0.75      0.71         8

    accuracy                           0.92       154
   macro avg       0.64      0.79      0.70       154
weighted avg       0.94      0.92      0.93       154



# Implement the Paper