In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

# Dataset root, given relative to the notebook's working directory.
data_folder = Path("../../data")

# Necessary for notebook to see src package
import sys
sys.path.append('../..')

Device: cuda


# Load Dataset and Indices

In [38]:
# Each dataset lives in its own sub-directory of data_folder and ships a data.csv.
insurance_dataset, king_dataset = (
    pd.read_csv(data_folder / dataset_dir / 'data.csv')
    for dataset_dir in ('Insurance', 'King')
)

# Pre-computed train/test index arrays stored next to each CSV.
insurance_train_idx, insurance_test_idx = (
    np.load(data_folder / 'Insurance' / idx_file)
    for idx_file in ('train_idx.npy', 'test_idx.npy')
)
king_train_idx, king_test_idx = (
    np.load(data_folder / 'King' / idx_file)
    for idx_file in ('train_idx.npy', 'test_idx.npy')
)

# Train Test Split

In [39]:
# The saved index arrays are applied positionally (iloc) to select whole rows.
insurance_train = insurance_dataset.iloc[insurance_train_idx]
insurance_test = insurance_dataset.iloc[insurance_test_idx]

king_train = king_dataset.iloc[king_train_idx]
king_test = king_dataset.iloc[king_test_idx]

# Metadata For the CTGAN Synthesizer

In [40]:
from sdv.metadata import SingleTableMetadata

# One metadata object per dataset, each starting out empty.
insurance_metadata = SingleTableMetadata()
king_metadata = SingleTableMetadata()

# Populate each metadata object from the corresponding training split only.
insurance_metadata.detect_from_dataframe(data=insurance_train)
king_metadata.detect_from_dataframe(data=king_train)

In [10]:
from sdv.single_table import CTGANSynthesizer

# Build a CTGAN model from the insurance metadata and train it on the
# insurance training split; verbose=True reports training progress.
insurance_synthesizer = CTGANSynthesizer(metadata=insurance_metadata, verbose=True)
insurance_synthesizer.fit(data=insurance_train)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Gen. (-0.78) | Discrim. (0.02): 100%|██████████| 300/300 [00:29<00:00, 10.02it/s] 


In [13]:
# Build a CTGAN model from the King metadata and train it on the
# King training split; verbose=True reports training progress.
king_synthesizer = CTGANSynthesizer(metadata=king_metadata, verbose=True)
king_synthesizer.fit(data=king_train)

Gen. (-0.67) | Discrim. (-0.18): 100%|██████████| 300/300 [06:58<00:00,  1.39s/it]


In [11]:
from src.metrics.js_divergence import JSDivergence
from src.metrics.wasserstein_distance import WassersteinDistance

def evaluate_synnthetic_data(real_data: pd.DataFrame, synhtetic_data: pd.DataFrame, metadata):
    """Score a synthetic table against a real one with two metrics.

    Args:
        real_data: Reference table.
        synhtetic_data: Generated table compared against ``real_data``.
        metadata: Forwarded unchanged to both metric computations.

    Returns:
        A ``(jsd, wd)`` tuple: the result of ``JSDivergence.compute`` followed
        by the result of ``WassersteinDistance.compute``, both called with the
        same three arguments.
    """
    metric_args = (real_data, synhtetic_data, metadata)
    # Tuple elements evaluate left to right, so JSD is still computed first.
    return (
        JSDivergence.compute(*metric_args),
        WassersteinDistance.compute(*metric_args),
    )

In [33]:
# Draw as many synthetic rows as the full insurance dataset has.
insurance_synthetic_data = insurance_synthesizer.sample(num_rows=insurance_dataset.shape[0])

# Only the JS divergence is kept; the Wasserstein result is discarded.
# NOTE(review): the comparison uses the full dataset, not just the training
# split the synthesizer was fitted on — confirm this is intended.
insurance_jsd, _ = evaluate_synnthetic_data(
    insurance_dataset,
    insurance_synthetic_data,
    insurance_metadata,
)

In [34]:
# Draw as many synthetic rows as the full King dataset has.
king_synthetic_data = king_synthesizer.sample(num_rows=king_dataset.shape[0])

# Only the JS divergence is kept; the Wasserstein result is discarded.
# NOTE(review): the comparison uses the full dataset, not just the training
# split the synthesizer was fitted on — confirm this is intended.
king_jsd, _ = evaluate_synnthetic_data(
    king_dataset,
    king_synthetic_data,
    king_metadata,
)

In [35]:
# Unweighted mean of the two per-dataset JS divergences.
np.mean([insurance_jsd, king_jsd])

0.13827951352221646

In [41]:
from sdv.single_table import CTGANSynthesizer

# Rebinds insurance_synthesizer to a fresh, not-yet-fitted CTGAN instance,
# replacing the model that was fitted earlier in the notebook.
insurance_synthesizer = CTGANSynthesizer(metadata=insurance_metadata, verbose=True)



In [49]:
# fit_processed_data expects data that has already gone through the
# synthesizer's own preprocessing step; handing it the raw training frame
# skips that step. Run preprocess() first and train on its output, which is
# the documented two-step equivalent of calling fit().
insurance_processed = insurance_synthesizer.preprocess(insurance_train)
insurance_synthesizer.fit_processed_data(insurance_processed)

Gen. (-1.29) | Discrim. (-0.02): 100%|██████████| 300/300 [00:17<00:00, 16.71it/s]


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,46,female,29.756,0,yes,southeast,10537.672763
1,41,male,26.065,0,no,southeast,29772.024859
2,39,male,28.726,3,no,northwest,13277.919676
3,44,female,25.040,4,no,northeast,7071.493423
4,45,female,35.685,1,yes,southeast,34126.484358
...,...,...,...,...,...,...,...
95,64,male,30.176,2,no,northwest,6390.942452
96,50,male,37.035,5,no,southwest,15413.225633
97,18,female,39.365,0,yes,southeast,9575.497090
98,39,male,31.439,3,no,southwest,15856.577952
