## Données

In [9]:
import pandas as pd

# Column labels applied to every dataset loaded below.
variables = [
    'sex', 'age', 'agegr', 'placesize', 'edu', 'socprof', 'marital', 'ls',
    'depress', 'trust', 'trustfam', 'trustneigh', 'sport', 'nofriend',
    'smoke', 'alcabuse', 'alcsol', 'wkabint', 'englang', 'height', 'weight',
    'bmi',
]


def _load(path):
    """Read one CSV file, labelling its columns with `variables`.

    NOTE(review): passing `names` without `header` makes pandas treat the
    first line of the file as data — assumes the CSVs have no header row,
    TODO confirm.
    """
    return pd.read_csv(path, names=variables)


# Reference dataset.
original = _load('SDV/df_original.csv')

# Comparison datasets (presumably synthetic data produced by the method named
# in each file — verify against the code that wrote them).
sample = _load('SDV/df1_sample.csv')
cart = _load('SDV/df1_cart.csv')
ctree = _load('SDV/df1_ctree.csv')
parametric = _load('SDV/df1_parametric.csv')
rf = _load('SDV/df1_rf.csv')
bag = _load('SDV/df1_bag.csv')

In [28]:
# Preview the first five rows of the reference dataset.
original.head(n=5)

4603

## Génération des données

In [None]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# Column groupings by type.
# NOTE(review): `num` and `fac` are not used anywhere in this cell; they look
# like they were meant to pin column sdtypes in the metadata — confirm whether
# the auto-detected types below are what is wanted.
num = ['age', 'depress', 'nofriend', 'height', 'weight', 'bmi']
fac = ['sex', 'agegr', 'placesize', 'edu', 'socprof', 'marital',
       'ls', 'trust', 'trustfam', 'trustneigh', 'sport', 'smoke',
       'alcabuse', 'alcsol', 'wkabint', 'englang']

# Build the metadata here, before the synthesizer needs it. It was previously
# created only in a later cell, so running the notebook top to bottom on a
# fresh kernel failed with a NameError on `metadata`.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original)

# Fit a CTGAN synthesizer on the reference dataset.
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(original)

In [48]:
import time

# Time the sampling step with perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring durations, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
# Draw as many synthetic rows as there are rows in the reference dataset.
synthetic_data = ctgan.sample(len(original))
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Temps pris pour générer les données synthétiques : {elapsed_time} secondes")

KeyboardInterrupt: 

## Diagnostic

### Risque

In [42]:
from sdv.evaluation.single_table import run_diagnostic
from sdv.metadata import SingleTableMetadata

# Describe the table by auto-detecting column types from the reference data.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original)

# Run the SDV diagnostic report comparing the `cart` dataset against the
# reference dataset.
diagnostic = run_diagnostic(real_data=original, synthetic_data=cart, metadata=metadata)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 22/22 [00:00<00:00, 464.00it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 182.48it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



### Utilité

In [44]:
from sdv.evaluation.single_table import evaluate_quality

# Run the SDV quality report comparing the `cart` dataset against the
# reference dataset.
quality_report = evaluate_quality(real_data=original, synthetic_data=cart, metadata=metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 22/22 [00:00<00:00, 198.94it/s]|
Column Shapes Score: 99.24%

(2/2) Evaluating Column Pair Trends: |██████████| 231/231 [00:07<00:00, 32.87it/s]|
Column Pair Trends Score: 97.28%

Overall Score (Average): 98.26%



## Graphiques

In [53]:
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot

# Single-column comparison of the reference and `cart` datasets for 'sex'.
fig = get_column_plot(real_data=original, synthetic_data=cart, column_name='sex', metadata=metadata)
fig.show()

# Two-column comparison of the reference and `cart` datasets for the
# ('sex', 'socprof') pair.
fig1 = get_column_pair_plot(real_data=original, synthetic_data=cart, column_names=['sex', 'socprof'], metadata=metadata)
fig1.show()