In [71]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

data_folder = Path("../../data")

# Necessary for notebook to see src package
import sys
sys.path.append('../..')

Device: cuda


# Load The Dataset

In [72]:
loan_dataset_path = data_folder / 'Loan.csv'
loan_dataset = pd.read_csv(loan_dataset_path)

insurance_dataset_path = data_folder / 'insurance.csv'
insurance_dataset = pd.read_csv(insurance_dataset_path)

In [73]:
loan_dataset

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


# Train Test Split

In [74]:
from sklearn.model_selection import train_test_split


def train_test_validation_split(
    X: pd.DataFrame,
    y: pd.DataFrame,
    test_ratio: float = 0.2,
    val_ratio: float = 0.2,
    seed = 0,
    stratify: bool = False
) -> tuple[pd.DataFrame]:
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, shuffle=True, random_state=seed, stratify=y if stratify else None
    )
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=test_ratio, shuffle=True, random_state=seed, stratify=y_train if stratify else None
    )
    
    return X_train, X_val, X_test, y_train, y_val, y_test

def concatanate_features_and_target(
    X: pd.DataFrame,
    y: pd.DataFrame
) -> pd.DataFrame:
    
    frame = pd.concat([X, y], axis=1)
    return frame

In [75]:
# Set apart the features and target

loan_features, loan_target = loan_dataset.drop(columns=['Personal Loan']), loan_dataset['Personal Loan']
insurance_features, insurance_target = insurance_dataset.iloc[:, :-1], insurance_dataset.iloc[:, -1]

# Get the train val test sets
insurance_features_train, insurance_features_val, insurance_features_test, insurance_target_train, \
insurance_target_val, insurance_target_test  = train_test_validation_split(
    insurance_features, 
    insurance_target
)

# Concatante features and targets for GAN training
insurance_train = concatanate_features_and_target(insurance_features_train, insurance_target_train)
insurance_val = concatanate_features_and_target(insurance_features_val, insurance_target_val)

In [76]:
from sdv.single_table.ctgan import CTGANSynthesizer
from sdv.metadata.single_table import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(insurance_dataset)

synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(insurance_dataset)



In [77]:
from src.metrics.js_divergence import JSDivergence                  # Discrete Columns
from src.metrics.wasserstein_distance import WassersteinDistance    # Continuous Columns
from src.metrics.propensity_score import PropensityScore            # All Columns
from src.metrics.pcorr_difference import PairwiseCorrelationDifference

In [None]:
synthetic_data = synthesizer.sample(len(insurance_dataset))

js_metric = JSDivergence.compute(insurance_dataset, synthetic_data, metadata)
wd_metric = WassersteinDistance.compute(insurance_dataset, synthetic_data, metadata)
propensity_score = PropensityScore.compute(insurance_dataset, synthetic_data, metadata)
pcorr_difference = PairwiseCorrelationDifference.compute(insurance_dataset, synthetic_data, metadata)

print(f'js_metric: {js_metric}')
print(f'wd_metric: {wd_metric}')
print(f'propensity_score: {propensity_score}')
print(f'pcorr_difference: {pcorr_difference}')

js_metric: 0.0647546071539449
wd_metric: 0.11911054535700788
propensity_score: 0.05060411431975033
pcorr_difference: 0.652155824203722


In [None]:
insurance_dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [None]:
insurance_numpy_folder = data_folder / 'yandex_paper_data' / 'insurance'


856

In [None]:
X_cat_test = np.load(insurance_numpy_folder / 'X_cat_test.npy', allow_pickle=True)
X_cat_train = np.load(insurance_numpy_folder / 'X_cat_train.npy', allow_pickle=True)
X_cat_val = np.load(insurance_numpy_folder / 'X_cat_val.npy', allow_pickle=True)

In [29]:
X_cat_val.shape

(214, 3)