In [2]:
import itertools
import yaml
from yaml.loader import SafeLoader
import mlflow
import mlflow.pyfunc
import joblib
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from sdv.evaluation.single_table import run_diagnostic
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot
from sdv.evaluation.single_table import evaluate_quality
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Environment-specific settings, named so they are easy to spot and change.
PROJECT_DIR = '/home/onyxia/work/synthetic-data-sdc/'
MLFLOW_TRACKING_SERVER = "https://projet-donnees-synthetiques-mlflow.user.lab.sspcloud.fr/"

# Work from the project checkout so the relative "SDV/..." paths used below resolve.
os.chdir(PROJECT_DIR)
# Later cells read the tracking server address back from this environment variable.
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_TRACKING_SERVER

In [3]:
# Column labels applied to the original dataset when it is read from CSV.
variables = [
    'sex', 'age', 'agegr', 'placesize', 'edu', 'socprof', 'marital',
    'ls', 'depress', 'trust', 'trustfam', 'trustneigh',
    'sport', 'nofriend', 'smoke', 'alcabuse', 'alcsol', 'wkabint', 'englang',
    'height', 'weight', 'bmi',
]

In [4]:
# Load the original table; its column labels are taken from `variables`.
original = pd.read_csv(
    'SDV/df_original.csv',
    names=variables,
)

In [5]:
# Build the SDV single-table metadata by auto-detecting it from the loaded dataframe.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original)

In [9]:
# Read the CTGAN hyperparameters from the YAML configuration file (safe loader).
with open("SDV/config_ctgan.yml") as config_file:
    config = yaml.safe_load(config_file)

class CTGANWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a synthesizer serialized with joblib.

    The wrapped object is restored from the ``"model_path"`` artifact and
    ``predict`` delegates to its ``sample`` method.
    """

    def __init__(self):
        # Filled in by load_context(); stays None until then.
        self.model = None

    def load_context(self, context):
        """Restore the serialized model from the ``"model_path"`` artifact.

        NOTE: joblib.load unpickles the file, so only load artifacts that
        come from trusted runs.
        """
        artifact_file = context.artifacts["model_path"]
        self.model = joblib.load(artifact_file)

    def predict(self, context, model_input):
        """Return ``len(model_input)`` sampled rows.

        Only the length of ``model_input`` is used; its contents are ignored.
        """
        requested_rows = len(model_input)
        return self.model.sample(requested_rows)


In [None]:
# Standalone CTGAN training with CUDA enabled, bracketed by wall-clock timestamps.
start_time = time.time()
hyperparameters = {
    name: config[name]
    for name in ('discriminator_lr', 'generator_lr', 'batch_size', 'epochs')
}
ctgan = CTGANSynthesizer(metadata, verbose=False, cuda=True, **hyperparameters)
ctgan.fit(original)
end_time = time.time()



In [None]:
# Display the loss values recorded while fitting `ctgan` (shown as the cell output).
ctgan.get_loss_values()

In [None]:
# Train one CTGAN model and record it in MLflow: YAML hyperparameters, training
# time, fitted-model parameters, per-epoch losses, the pickled synthesizer and
# the configuration file itself.
remote_server_uri = os.environ["MLFLOW_TRACKING_URI"]
experiment_name = 'Default'
run_name = (
    f"ctgan_sd2011_dlr{config['discriminator_lr']}"
    f"_glr{config['generator_lr']}"
    f"_b{config['batch_size']}"
    f"_e{config['epochs']}_gpu"
)

mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment(experiment_name)

# The context manager ends the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name=run_name):
    # Hyperparameters from the YAML config; a top-level 'mlflow' entry is skipped.
    for param_key, param_value in config.items():
        if param_key != 'mlflow':
            mlflow.log_param(param_key, param_value)
    mlflow.log_param('run_name', run_name)

    # Training
    start_time = time.time()
    ctgan = CTGANSynthesizer(metadata,
                             verbose=False,
                             discriminator_lr=config['discriminator_lr'],
                             generator_lr=config['generator_lr'],
                             batch_size=config['batch_size'],
                             epochs=config['epochs'],
                             # Explicit so the run really matches its '_gpu' name suffix.
                             cuda=True)
    ctgan.fit(original)
    end_time = time.time()

    elapsed_time = end_time - start_time
    mlflow.log_metric("elapsed_time", elapsed_time)
    print(f"Temps pris entraîner le modèle : {elapsed_time} secondes")

    # Parameters reported by the fitted synthesizer. The variable name is a
    # leftover from a TVAE notebook; it is kept so any cell reading it still works.
    params_tvae = ctgan.get_parameters()

    # Log the model parameters in ONE child run. A child run keeps them apart
    # from the YAML parameters already logged on the parent (which may share
    # keys), without creating a separate run for every single parameter.
    with mlflow.start_run(run_name='model_parameters', nested=True):
        mlflow.log_params(params_tvae)

    # Log every loss column as its own metric, averaged per epoch. CTGAN is
    # expected to report separate generator/discriminator losses rather than a
    # single 'Loss' column (TODO confirm against the installed SDV version);
    # a plain 'Loss' column would still be logged as "loss".
    loss_values = ctgan.get_loss_values()
    loss_columns = [col for col in loss_values.columns if col not in ('Epoch', 'Batch')]
    loss_by_epoch = loss_values.groupby('Epoch')[loss_columns].mean()
    for loss_column in loss_columns:
        metric_name = loss_column.lower().replace(' ', '_')
        for step, loss in enumerate(loss_by_epoch[loss_column].tolist(), start=1):
            mlflow.log_metric(metric_name, loss, step=step)

    # Serialize the synthesizer locally, then attach it to the run through the
    # pyfunc wrapper together with the configuration used.
    model_path = run_name + ".pkl"
    joblib.dump(ctgan, model_path)
    mlflow.pyfunc.log_model(
        artifact_path="ctgan_model",
        python_model=CTGANWrapper(),
        artifacts={"model_path": model_path},
    )
    mlflow.log_artifact("SDV/config_ctgan.yml", artifact_path="config_model")

## Variabilité du modèle

In [None]:
# Re-read the CTGAN hyperparameters so this section picks up the current YAML file.
with open("SDV/config_ctgan.yml") as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Model variability: train the same CTGAN configuration repeatedly, log every
# run to MLflow and collect the per-epoch losses of each run in `all_loss`.
remote_server_uri = os.environ["MLFLOW_TRACKING_URI"]
experiment_name = 'Default'
n_iterations = 100  # number of independent trainings

# Loop-invariant MLflow setup, done once instead of at every iteration.
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment(experiment_name)

# 'epoch' holds the epoch index; per-run loss series are added under
# 'iter<i>' (single loss column) or 'iter<i>_<metric>' (several loss columns).
all_loss = {'epoch': list(range(config['epochs']))}

for i in range(n_iterations):
    # Progress indicator every 10 runs.
    if (i+1) % 10 == 0:
        print(i+1)
    run_name = (
        f"ctgan_sd2011_dlr{config['discriminator_lr']}"
        f"_glr{config['generator_lr']}"
        f"_b{config['batch_size']}"
        f"_e{config['epochs']}"
        f"_iter{i}_gpu"
    )

    # The context manager ends the run on exit, so no explicit mlflow.end_run() is needed.
    with mlflow.start_run(run_name=run_name):
        # Hyperparameters from the YAML config; a top-level 'mlflow' entry is skipped.
        for param_key, param_value in config.items():
            if param_key != 'mlflow':
                mlflow.log_param(param_key, param_value)
        mlflow.log_param('run_name', run_name)

        # Training
        start_time = time.time()
        ctgan = CTGANSynthesizer(metadata,
                                 verbose=False,
                                 discriminator_lr=config['discriminator_lr'],
                                 generator_lr=config['generator_lr'],
                                 batch_size=config['batch_size'],
                                 epochs=config['epochs'],
                                 # Explicit so the run really matches its '_gpu' name suffix.
                                 cuda=True)
        ctgan.fit(original)
        end_time = time.time()

        elapsed_time = end_time - start_time
        mlflow.log_metric("elapsed_time", elapsed_time)
        print(f"Temps pris entraîner le modèle : {elapsed_time} secondes")

        # Parameters of the model fitted in THIS iteration (previously a stale
        # variable left over from another cell was logged instead).
        params_ctgan = ctgan.get_parameters()

        # Log them in ONE child run: kept apart from the YAML parameters of the
        # parent run, without creating a separate run per parameter.
        with mlflow.start_run(run_name='model_parameters', nested=True):
            mlflow.log_params(params_ctgan)

        # Average each loss column per epoch. CTGAN is expected to report
        # separate generator/discriminator losses rather than a single 'Loss'
        # column (TODO confirm against the installed SDV version); a plain
        # 'Loss' column would still be logged as "loss".
        loss_values = ctgan.get_loss_values()
        loss_columns = [col for col in loss_values.columns if col not in ('Epoch', 'Batch')]
        loss_by_epoch = loss_values.groupby('Epoch')[loss_columns].mean()
        nom_iter = 'iter' + str(i)
        for loss_column in loss_columns:
            metric_name = loss_column.lower().replace(' ', '_')
            epoch_losses = loss_by_epoch[loss_column].tolist()
            # Keep the plain 'iter<i>' key when there is only one loss series.
            if len(loss_columns) == 1:
                loss_key = nom_iter
            else:
                loss_key = nom_iter + '_' + metric_name
            all_loss[loss_key] = epoch_losses
            for step, loss in enumerate(epoch_losses, start=1):
                mlflow.log_metric(metric_name, loss, step=step)

        # Serialize the synthesizer locally, then attach it to the run through
        # the pyfunc wrapper together with the configuration used.
        model_path = run_name + ".pkl"
        joblib.dump(ctgan, model_path)
        mlflow.pyfunc.log_model(
            artifact_path="ctgan_model",
            python_model=CTGANWrapper(),
            artifacts={"model_path": model_path},
        )
        mlflow.log_artifact("SDV/config_ctgan.yml", artifact_path="config_model")