In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

data_folder = Path("../../data")
dataset_location = data_folder / 'Loan' /"data.csv"

# Necessary for notebook to see src
import sys
sys.path.append('../..')

from src.visualization import plot_corelation_matrix, plot_continuous_columns

Device: cuda


# Load Dataset


In [26]:
original_dataset = pd.read_csv(dataset_location)
dataset = original_dataset.copy(deep=True)

dataset.drop(columns='ID', inplace=True)

train_indices = np.load(data_folder / 'Loan' / 'train_idx.npy')

train_set = dataset.iloc[train_indices]
train_set

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
408,60,36,89,91745,2,2.8,1,0,0,0,0,1,0
3737,44,19,30,91423,1,0.5,3,0,0,0,0,1,0
3889,26,0,19,93014,1,0.1,2,121,0,0,0,1,0
568,34,9,41,92101,2,0.1,1,161,0,0,0,1,1
1041,56,32,51,92780,4,1.5,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4587,37,11,59,94720,4,0.2,3,0,0,0,0,0,0
2341,36,10,91,92028,1,1.5,3,289,0,0,0,1,0
1585,57,31,131,90502,2,2.7,1,0,0,0,0,0,0
3152,40,15,83,90275,1,1.0,3,0,0,0,0,0,0


In [28]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 5000 non-null   int64  
 1   Experience          5000 non-null   int64  
 2   Income              5000 non-null   int64  
 3   ZIP Code            5000 non-null   int64  
 4   Family              5000 non-null   int64  
 5   CCAvg               5000 non-null   float64
 6   Education           5000 non-null   int64  
 7   Mortgage            5000 non-null   int64  
 8   Personal Loan       5000 non-null   int64  
 9   Securities Account  5000 non-null   int64  
 10  CD Account          5000 non-null   int64  
 11  Online              5000 non-null   int64  
 12  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(12)
memory usage: 507.9 KB


# Detect Metadata

In [31]:
from src.evaluation.synthesizer_evaluation import SynthesizerEvaluation
from sdv.metadata.single_table import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_set)

# Import Relevant Libraries

In [32]:
from src.metrics.js_divergence import JSDivergence                  # Discrete Columns
from src.metrics.wasserstein_distance import WassersteinDistance    # Continuous Columns
from src.metrics.propensity_score import PropensityScore            # All Columns
from src.metrics.pcorr_difference import PairwiseCorrelationDifference
import optuna

# Objective

In [35]:
from sdv.single_table import CTGANSynthesizer
from src.synthesizers import save_model
from src.schemas.trial_info import TrialInfo, SynthesizerHyperParameters


def objective(trial: optuna.Trial):
    discriminator_steps = trial.suggest_int('discriminator_steps', 1, 5)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-7, 5e-4, log=True)
    epochs = trial.suggest_int('epochs', 300, 2000, 10)
    
    synthesizer = CTGANSynthesizer(
        metadata, 
        generator_lr=lr, 
        discriminator_steps=discriminator_steps,
        generator_decay=weight_decay,
        discriminator_decay=weight_decay,
        epochs=epochs, 
        discriminator_lr=lr,
        verbose=False,
    )

    synthesizer.fit(train_set)

    synthetic_data = synthesizer.sample(len(train_set))

    js_metric = JSDivergence.compute(dataset, synthetic_data, metadata)
    wd_metric = WassersteinDistance.compute(dataset, synthetic_data, metadata)
    propensity_score = PropensityScore.compute(dataset, synthetic_data, metadata)
    pcorr_difference = PairwiseCorrelationDifference.compute(dataset, synthetic_data, metadata)

    file_path = save_model(synthesizer, 'CTGAN', 'Loan', folder_path='../../models')

    trial.set_user_attr('file_path', file_path)
    trial.set_user_attr('JSD', js_metric)
    trial.set_user_attr('WD', wd_metric)
    trial.set_user_attr('PS', propensity_score)
    trial.set_user_attr('PCD', pcorr_difference)

 
    return propensity_score

# STUDY

In [36]:
study = optuna.study.create_study(
    directions=['minimize']
)
study.optimize(objective, n_trials=10, n_jobs=1)

[I 2024-12-10 12:27:15,237] A new study created in memory with name: no-name-6518ba15-6ffb-4bef-9fdc-92438aff5fe7
[I 2024-12-10 12:32:06,899] Trial 0 finished with value: 0.009488496155547305 and parameters: {'discriminator_steps': 5, 'lr': 2.2265571681533632e-05, 'weight_decay': 2.2949210971233803e-07, 'epochs': 660}. Best is trial 0 with value: 0.009488496155547305.
[I 2024-12-10 12:35:34,597] Trial 1 finished with value: 0.00977718066631473 and parameters: {'discriminator_steps': 3, 'lr': 4.4375759499831246e-05, 'weight_decay': 1.2988101421637532e-06, 'epochs': 740}. Best is trial 0 with value: 0.009488496155547305.
[I 2024-12-10 12:36:52,212] Trial 2 finished with value: 0.014313264908747937 and parameters: {'discriminator_steps': 1, 'lr': 2.9732190380148477e-05, 'weight_decay': 4.677600438239474e-06, 'epochs': 630}. Best is trial 0 with value: 0.009488496155547305.
[I 2024-12-10 12:45:34,265] Trial 3 finished with value: 0.010405759214664264 and parameters: {'discriminator_steps':

In [37]:
trials_made = study.trials_dataframe(attrs=('params', 'value', 'user_attrs'), multi_index=True)
trials_made.to_csv('trial_info.csv', index=False)

In [38]:
study.best_trials

[FrozenTrial(number=0, state=1, values=[0.009488496155547305], datetime_start=datetime.datetime(2024, 12, 10, 12, 27, 15, 239905), datetime_complete=datetime.datetime(2024, 12, 10, 12, 32, 6, 899721), params={'discriminator_steps': 5, 'lr': 2.2265571681533632e-05, 'weight_decay': 2.2949210971233803e-07, 'epochs': 660}, user_attrs={'file_path': '../../models/Loan/CTGAN/730105a0c8b640e385ff1ccafa9941d41e12b147275fd189c7f6fd092ec2f02f.pth', 'JSD': 0.04582870784098936, 'WD': 0.023450659423581564, 'PS': 0.009488496155547305, 'PCD': 1.4859128246235938}, system_attrs={}, intermediate_values={}, distributions={'discriminator_steps': IntDistribution(high=5, log=False, low=1, step=1), 'lr': FloatDistribution(high=0.01, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.0005, log=True, low=1e-07, step=None), 'epochs': IntDistribution(high=2000, log=False, low=300, step=10)}, trial_id=0, value=None)]

In [39]:
trials_made

Unnamed: 0_level_0,params,params,params,params,value,user_attrs,user_attrs,user_attrs,user_attrs,user_attrs
Unnamed: 0_level_1,discriminator_steps,epochs,lr,weight_decay,Unnamed: 5_level_1,JSD,PCD,PS,WD,file_path
0,5,660,2.2e-05,2.294921e-07,0.009488,0.045829,1.485913,0.009488,0.023451,../../models/Loan/CTGAN/730105a0c8b640e385ff1c...
1,3,740,4.4e-05,1.29881e-06,0.009777,0.03767,0.922413,0.009777,0.015199,../../models/Loan/CTGAN/a4b146d010053f5c67a1bf...
2,1,630,3e-05,4.6776e-06,0.014313,0.046643,1.626585,0.014313,0.032248,../../models/Loan/CTGAN/3bbfe072a83db1d2da0c7b...
3,5,1180,5.1e-05,2.832857e-05,0.010406,0.045096,0.744876,0.010406,0.021257,../../models/Loan/CTGAN/44a991f63c6b035955d79a...
4,3,1630,4.2e-05,5.085359e-05,0.011755,0.048857,0.580312,0.011755,0.034785,../../models/Loan/CTGAN/77c9f3f41352af0a064c4e...
5,1,800,4.4e-05,1.489318e-06,0.013004,0.050012,1.376153,0.013004,0.0412,../../models/Loan/CTGAN/f5ce8d4d2ce1b4aa3ec155...
6,2,1850,0.002184,8.278742e-05,0.011558,0.044318,0.565066,0.011558,0.023078,../../models/Loan/CTGAN/db4945d88770ca0a5c8608...
7,1,1650,0.000833,3.574327e-05,0.016729,0.062114,0.405223,0.016729,0.022991,../../models/Loan/CTGAN/efa1352b872fef925e9e06...
8,3,970,0.000254,0.0003545518,0.023931,0.065214,0.373059,0.023931,0.017939,../../models/Loan/CTGAN/c5679926123f447184c82f...
9,5,1890,1.6e-05,8.71112e-05,0.010543,0.045358,0.867057,0.010543,0.014063,../../models/Loan/CTGAN/4bb697b9126bc8ea10a23c...
