## Données

In [9]:
import pandas as pd

variables = ['sex','age','agegr','placesize','edu','socprof','marital','ls','depress','trust','trustfam','trustneigh','sport','nofriend','smoke','alcabuse','alcsol','wkabint','englang','height','weight','bmi']
original = pd.read_csv('SDV/df_original.csv', names = variables)
sample = pd.read_csv('SDV/df1_sample.csv', names = variables)
cart = pd.read_csv('SDV/df1_cart.csv', names = variables)
ctree = pd.read_csv('SDV/df1_ctree.csv', names = variables)
parametric = pd.read_csv('SDV/df1_parametric.csv', names = variables)
rf = pd.read_csv('SDV/df1_rf.csv', names = variables)
bag = pd.read_csv('SDV/df1_bag.csv', names = variables)

In [28]:
original.head()

4603

## Génération des données

In [39]:
import time
from ctgan import CTGAN

num = ['age', 'depress', 'nofriend', 'height', 'weight', 'bmi']
fac = ['sex', 'agegr', 'placesize', 'edu', 'socprof', 'marital',
       'ls', 'trust', 'trustfam', 'trustneigh', 'sport', 'smoke',
       'alcabuse', 'alcsol', 'wkabint', 'englang']

ctgan = CTGAN(epochs = 10)
ctgan.fit(original, fac)

start_time = time.time()
synthetic_data = ctgan.sample(len(original))
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Temps pris pour générer les données synthétiques : {elapsed_time} secondes")


Temps pris pour générer les données synthétiques : 39.928112268447876 secondes


## Diagnostique

### Risque

In [26]:
from sdv.evaluation.single_table import run_diagnostic

metadata = pd.DataFrame({
    "columns": {
        "sex": {"type": "categorical"},
        "age": {"type": "numerical"},
        "agegr": {"type": "categorical"},
        "placesize": {"type": "categorical"},
        "edu": {"type": "categorical"},
        "socprof": {"type": "categorical"},
        "marital": {"type": "categorical"},
        "ls": {"type": "categorical"},
        "depress": {"type": "numerical"},
        "trust": {"type": "categorical"},
        "trustfam": {"type": "categorical"},
        "trustneigh": {"type": "categorical"},
        "sport": {"type": "categorical"},
        "nofriend": {"type": "numerical"},
        "smoke": {"type": "categorical"},
        "alcabuse": {"type": "categorical"},
        "alcsol": {"type": "categorical"},
        "wkabint": {"type": "categorical"},
        "englang": {"type": "categorical"},
        "height": {"type": "numerical"},
        "weight": {"type": "numerical"},
        "bmi": {"type": "numerical"},
    }
})

diagnostic = run_diagnostic(
    real_data = original,
    synthetic_data = cart,
    metadata = original
)

TypeError: to_dict() takes from 1 to 2 positional arguments but 4 were given

### Utilité

In [94]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 903.34it/s]|
Column Shapes Score: 72.53%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 231.14it/s]|
Column Pair Trends Score: 72.95%

Overall Score (Average): 72.74%



## Graphiques

In [100]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data = real_data,
    synthetic_data = synthetic_data,
    column_name = 'room_type',
    metadata = metadata
)
fig.show()