In [1]:
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

# Run from the project checkout so that the relative paths used further down
# ('SDV/...', 'TableEvaluator/...') resolve.
# NOTE(review): this absolute path is machine-specific; the notebook only
# runs where this directory exists.
PROJECT_DIR = '/home/onyxia/work/synthetic-data-sdc/'
os.chdir(PROJECT_DIR)

# Publish the MLflow tracking server URL through the environment; it is read
# back from os.environ when the MLflow client is configured.
MLFLOW_URI = "https://projet-donnees-synthetiques-mlflow.user.lab.sspcloud.fr/"
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_URI

# Données

In [2]:
# Column names assigned to 'SDV/df_original.csv' when it is loaded below.
variables = ['sex','age','agegr','placesize','edu','socprof','marital','ls','depress','trust','trustfam','trustneigh','sport','nofriend','smoke','alcabuse','alcsol','wkabint','englang','height','weight','bmi']

# Column names assigned to 'TableEvaluator/puf.csv' when it is loaded below.
var_puf65 = ["AAC","ACTEU","AGE6","ANCCHOM","ANCEMPL4","ANCSSEMP","CHPUB","CL_EMPLOI","COUPL_LOG","DEMNE","DIP7","DISPONE","DISPPLC","ENFRED","EXTRIAN","HALOR","HEFFEMP","HEFFTOT",
           "HHABEMP","HHABTOT","METRODOM","MRANE","MRBNE","MRCNE","MRDNE","MRENE","MRFNE","MRGNE","MRHNE","MRINE","MRJNE","NAFANTG004N","NAFG004UN","NAFG010UN","NAFG017UN",
           "NAFG021UN","NOI","OFFICC","PASTRA","PASTRB","PCS1","PCS1Q","PUB3FP","RABS","RAISDISPPLC","RAISNDISPONE","RAISNRECNE","RAISNSOUNE","RAISTP","RECNE","SALTYP","SEXE",
           "SOU_C","SOUSEMPL","SOUSEMPLR","STATUT","STATUTDET","STC","STCOMM2020","STPLC","TEMP","TPPRED","TRAREF","TXTPPRED","TYPLOG5"]

# Same names as var_puf71 minus EXTRIAN, HEFFEMP, HEFFTOT, HHABEMP and HHABTOT.
# Presumably the categorical ("factor") subset of the 71-column table -- TODO confirm.
# Not referenced by any other cell visible in this notebook.
fac_puf71 = ["AAC","ACTEU","AGE6","ANCCHOM","ANCEMPL4","ANCSSEMP","CHPUB","CL_EMPLOI","COUPL_LOG","DEMNE","DIP7","DISPONE","DISPPLC","ENFRED","HALOR","IDENT","ISCO2","METRODOM","MRANE","MRBNE","MRCNE","MRDNE","MRENE","MRFNE","MRGNE","MRHNE","MRINE","MRJNE","NAFANTG004N","NAFANTG088N","NAFG004UN","NAFG010UN","NAFG017UN",
           "NAFG021UN","NAFG038UN","NAFG088UN","NOI","OFFICC","PASTRA","PASTRB","PCS1","PCS1Q","PCS2","PUB3FP","RABS","RAISDISPPLC","RAISNDISPONE","RAISNRECNE","RAISNSOUNE","RAISTP","RECNE","SALTYP","SEXE",
           "SOU_C","SOUSEMPL","SOUSEMPLR","STATUT","STATUTDET","STC","STCOMM2020","STPLC","TEMP","TPPRED","TRAREF","TXTPPRED","TYPLOG5"]

# var_puf65 plus IDENT, ISCO2, NAFANTG088N, NAFG038UN, NAFG088UN and PCS2.
# Not referenced by any other cell visible in this notebook.
var_puf71 = ["AAC","ACTEU","AGE6","ANCCHOM","ANCEMPL4","ANCSSEMP","CHPUB","CL_EMPLOI","COUPL_LOG","DEMNE","DIP7","DISPONE","DISPPLC","ENFRED","EXTRIAN","HALOR","HEFFEMP","HEFFTOT",
           "HHABEMP","HHABTOT","IDENT","ISCO2","METRODOM","MRANE","MRBNE","MRCNE","MRDNE","MRENE","MRFNE","MRGNE","MRHNE","MRINE","MRJNE","NAFANTG004N","NAFANTG088N","NAFG004UN","NAFG010UN","NAFG017UN",
           "NAFG021UN","NAFG038UN","NAFG088UN","NOI","OFFICC","PASTRA","PASTRB","PCS1","PCS1Q","PCS2","PUB3FP","RABS","RAISDISPPLC","RAISNDISPONE","RAISNRECNE","RAISNSOUNE","RAISTP","RECNE","SALTYP","SEXE",
           "SOU_C","SOUSEMPL","SOUSEMPLR","STATUT","STATUTDET","STC","STCOMM2020","STPLC","TEMP","TPPRED","TRAREF","TXTPPRED","TYPLOG5"]


# Load the two input tables (paths are relative to the working directory set
# in the first cell) and label their columns with the lists defined above.
# NOTE(review): passing names= without header= makes pandas treat the first
# line of each file as data. If these CSVs carry a header row it ends up as a
# data row -- TODO confirm the files are header-less.
original_csv = 'SDV/df_original.csv'
puf65_csv = 'TableEvaluator/puf.csv'

original = pd.read_csv(original_csv, names=variables)
puf65 = pd.read_csv(puf65_csv, names=var_puf65)


  puf65 = pd.read_csv('TableEvaluator/puf.csv', names = var_puf65)


# Metadata

In [3]:
# Create an empty SDV single-table metadata object and populate it from the
# puf65 dataframe. The resulting `metadata` is what each TVAESynthesizer in
# the simulation cell is constructed with.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(puf65)

# Simulations

In [4]:
import itertools
import yaml
from yaml.loader import SafeLoader
import mlflow
import mlflow.pyfunc
import joblib

In [None]:
# Parse the YAML configuration with PyYAML's safe loader.
# NOTE(review): `config` is not read by any other cell visible in this notebook.
with open("SDV/config.yml") as config_file:
    config = yaml.safe_load(config_file)

class TVAEWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc adapter around a joblib-serialised synthesizer.

    The wrapped object is restored from the ``"model_path"`` artifact in
    ``load_context``; ``predict`` then delegates to its ``sample`` method.
    """

    def __init__(self):
        # Stays None until load_context() has restored the serialised model.
        self.model = None

    def load_context(self, context):
        """Load the object stored under the ``"model_path"`` artifact into ``self.model``."""
        artifact_file = context.artifacts["model_path"]
        self.model = joblib.load(artifact_file)

    def predict(self, context, model_input):
        """Return ``self.model.sample(n)`` with ``n = len(model_input)``.

        ``model_input`` is only used for its length; its contents are ignored.
        ``context`` is unused.
        """
        n_rows = len(model_input)
        return self.model.sample(n_rows)

# Point the MLflow client at the tracking server whose URL was placed in the
# environment by the first cell.
remote_server_uri = os.environ["MLFLOW_TRACKING_URI"]
mlflow.set_tracking_uri(remote_server_uri)

# Experiment under which the runs below are recorded.
experiment_name = 'Default'
mlflow.set_experiment(experiment_name)

# Name shared by every run started in the simulation loop.
run_name = 'sim_puf_tvae'

# Train 20 TVAE synthesizers on puf65, one MLflow run per repetition. Each run
# records the training time, the synthesizer's hyper-parameters, its loss
# curve, a synthetic sample (CSV) and the fitted model (pyfunc artifact).
for i in range(20):
    # The context manager ends the run on exit (including on error), so no
    # explicit mlflow.end_run() is required afterwards.
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param('run_name', run_name)

        # Training -- only the fit itself is timed.
        start_time = time.time()
        tvae = TVAESynthesizer(metadata, verbose=True)
        tvae.fit(puf65)
        end_time = time.time()

        elapsed_time = end_time - start_time
        mlflow.log_metric("elapsed_time", elapsed_time)
        print(f"Temps pris entraîner le modèle : {elapsed_time} secondes")

        # Draw the synthetic table before saving it. `puf65_tvae` was
        # previously written to disk without ever being defined (NameError
        # on a fresh kernel).
        # NOTE(review): sampling as many rows as the training table is an
        # assumption -- adjust num_rows if another size is intended.
        puf65_tvae = tvae.sample(num_rows=len(puf65))
        puf65_tvae.to_csv(f"SDV/puf65_tvae_{i}.csv")

        # Log the model hyper-parameters on this run in a single call,
        # instead of opening one nested child run per parameter.
        params_tvae = tvae.get_parameters()
        mlflow.log_params(params_tvae)

        # Log the training loss curve, one point per row of the loss table.
        # NOTE(review): rows may be per batch rather than per epoch depending
        # on the SDV version -- confirm what `step` represents.
        loss_values = tvae.get_loss_values()
        for step, loss_value in enumerate(loss_values['Loss'].tolist()):
            mlflow.log_metric("loss", loss_value, step=step)

        # Serialise the fitted synthesizer and attach it to the run through
        # the pyfunc wrapper. The local file is overwritten at each
        # repetition; the copy logged to MLflow is what persists per run.
        model_path = "tvae_model.pkl"
        joblib.dump(tvae, model_path)
        mlflow.pyfunc.log_model(
            artifact_path="tvae_model",
            python_model=TVAEWrapper(),
            artifacts={"model_path": model_path},
        )

Loss: 31.427:  11%|█▏        | 34/300 [19:16<2:56:35, 39.83s/it]