In [1]:
from sdv.datasets.demo import download_demo

# Fetch the single-table 'fake_hotel_guests' demo dataset together with its metadata.
real_data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests')

In [2]:
# Preview the first rows of the real dataset.
real_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


### Fonction permettant de connaître les modalités de chaque variable

In [83]:
import numpy as np
import pandas as pd

def _is_numeric_column(series):
    """Return True when ``series`` should be summarised as a numeric variable."""
    dtype = series.dtype
    if isinstance(dtype, np.dtype):
        # Plain NumPy dtypes: same test as before, so boolean and object
        # columns keep going through the categorical branch.
        return bool(np.issubdtype(dtype, np.number))
    # pandas extension dtypes (category, string, nullable Int64, ...) make
    # np.issubdtype raise TypeError, so rely on pandas' own dtype helpers.
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


def get_statistics(dataframe):
    """Build a per-column summary table of ``dataframe``.

    Numeric columns are described by ``min``, ``max``, ``mediane`` (median),
    ``moyenne`` (mean), ``std``, ``cv`` (std / mean, NaN when the mean is 0)
    and the 2.5 %, 25 %, 75 % and 97.5 % quantiles.

    Every other column is described by its number of distinct non-missing
    values (``nb_modalites``), the most and least frequent value
    (``plus_freq`` / ``moins_freq``) and their counts (``plus_freq_eff`` /
    ``moins_freq_eff``). A column without any non-missing value gets
    ``nb_modalites`` = 0 and NaN for the other fields instead of raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per input column, identified by the ``variable`` column.
        Fields that do not apply to a column's kind are left as NaN.
    """
    stats_list = []

    for column in dataframe.columns:
        series = dataframe[column]

        if _is_numeric_column(series):
            # Compute mean/std once; both are reused for the coefficient of variation.
            mean = series.mean()
            std = series.std()
            stats = {
                'variable': column,
                'min': series.min(),
                'max': series.max(),
                'mediane': series.median(),
                'moyenne': mean,
                'std': std,
                'cv': std / mean if mean != 0 else np.nan,
                'q025': series.quantile(0.025),
                'q1': series.quantile(0.25),
                'q3': series.quantile(0.75),
                'q975': series.quantile(0.975)
            }
        else:
            # value_counts() ignores missing values by default.
            value_counts = series.value_counts()
            if value_counts.empty:
                # No observed value at all: idxmax()/idxmin() would raise on
                # an empty Series, so report an explicit "nothing" row.
                stats = {
                    'variable': column,
                    'nb_modalites': 0,
                    'plus_freq': np.nan,
                    'plus_freq_eff': np.nan,
                    'moins_freq': np.nan,
                    'moins_freq_eff': np.nan
                }
            else:
                stats = {
                    'variable': column,
                    'nb_modalites': value_counts.size,
                    'plus_freq': value_counts.idxmax(),
                    'plus_freq_eff': value_counts.max(),
                    'moins_freq': value_counts.idxmin(),
                    'moins_freq_eff': value_counts.min()
                }

        stats_list.append(stats)

    stats_df = pd.DataFrame(stats_list)
    return stats_df

In [85]:
# Summarise every column of the real data (baseline for the comparison with the synthetic data).
get_statistics(real_data)

Unnamed: 0,variable,nb_modalites,plus_freq,plus_freq_eff,moins_freq,moins_freq_eff,min,max,mediane,moyenne,std,cv,q025,q1,q3,q975
0,guest_email,500.0,danieltaylor@harper.com,1.0,danieltaylor@harper.com,1.0,,,,,,,,,,
1,has_rewards,2.0,False,447.0,True,53.0,,,,,,,,,,
2,room_type,3.0,BASIC,384.0,SUITE,38.0,,,,,,,,,,
3,amenities_fee,,,,,,0.0,48.12,18.59,18.17607,11.01881,0.606226,0.0,10.42,25.57,38.6065
4,checkin_date,265.0,13 Oct 2020,7.0,22 Mar 2020,1.0,,,,,,,,,,
5,checkout_date,275.0,15 Oct 2020,5.0,25 Mar 2020,1.0,,,,,,,,,,
6,room_rate,,,,,,83.8,424.84,130.11,152.5894,56.30475,0.368995,101.5665,112.8925,180.8875,294.8355
7,billing_address,230.0,"77 Massachusetts Ave\nCambridge, MA 02139",53.0,"06567 John Walk Apt. 238\nWest Scott, NV 61617",1.0,,,,,,,,,,
8,credit_card_number,,,,,,501800400000.0,4.985421e+18,2384672000000000.0,4.330974e+17,1.312004e+18,3.029351,563440600000.0,30454670000000.0,4509813000000000.0,4.642584e+18


## Création du synthétiseur

In [4]:
from sdv.single_table import CTGANSynthesizer

# Build a CTGAN synthesizer from the dataset metadata, then train it on the real data.
synthesizer = CTGANSynthesizer(metadata=metadata)
synthesizer.fit(data=real_data)

## Génération des données synthétiques

In [91]:
# Draw as many synthetic rows as there are real rows, so that the frequency
# counts reported by get_statistics are directly comparable between the two tables.
synthetic_data = synthesizer.sample(num_rows=len(real_data))
synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,tsanchez@example.com,False,DELUXE,0.0,05 Jan 2020,07 Jan 2020,251.74,"52424 Ashley Ridges\nLake Daniel, MP 27650",3582077138450885
1,bellshawn@example.com,False,DELUXE,2.93,04 Nov 2020,01 Mar 2020,304.38,"18561 Thomas Canyon\nJoshuamouth, SD 22073",4142271383722418
2,iwhite@example.org,False,BASIC,15.02,21 Oct 2020,07 Jul 2020,156.19,"78944 Marie Harbor\nCynthiaton, IA 56020",6573028438398211
3,christophermiller@example.com,False,SUITE,13.73,14 Mar 2020,10 Aug 2020,136.59,"47551 Hall Flats Apt. 315\nSouth James, WA 16993",30343480880655
4,dgarcia@example.org,True,DELUXE,11.12,05 Jan 2020,08 Mar 2020,135.03,"38378 Nicholas Mount\nWest Michael, CO 91475",4930915359735


In [92]:
# Same per-column summary, this time on the synthetic data, to compare with the real-data table.
get_statistics(synthetic_data)

Unnamed: 0,variable,nb_modalites,plus_freq,plus_freq_eff,moins_freq,moins_freq_eff,min,max,mediane,moyenne,std,cv,q025,q1,q3,q975
0,guest_email,500.0,randallcruz@example.net,1.0,randallcruz@example.net,1.0,,,,,,,,,,
1,has_rewards,2.0,False,355.0,True,145.0,,,,,,,,,,
2,room_type,3.0,BASIC,259.0,SUITE,103.0,,,,,,,,,,
3,amenities_fee,,,,,,0.0,48.12,3.46,7.860763,9.856661,1.253906,0.0,0.0,13.365,32.6495
4,checkin_date,249.0,05 Jan 2020,63.0,03 Mar 2020,1.0,,,,,,,,,,
5,checkout_date,230.0,07 Jan 2020,85.0,17 Jan 2020,1.0,,,,,,,,,,
6,room_rate,,,,,,83.8,424.84,151.365,180.201,67.74494,0.375941,103.5455,135.21,226.375,356.2062
7,billing_address,500.0,"61298 Gibson Manor\nHannahville, FM 54615",1.0,"61298 Gibson Manor\nHannahville, FM 54615",1.0,,,,,,,,,,
8,credit_card_number,,,,,,60432460000.0,4.984532e+18,3539338000000000.0,3.882559e+17,1.276814e+18,3.288589,570355700000.0,180041200000000.0,4902626000000000.0,4.76822e+18


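Sur cette simple synthétisation, on constate que, pour les variables numériques `amenities_fee` et `room_rate`, le synthétiseur ne crée pas de valeurs en dehors de l'étendue originale (mêmes minimum et maximum). Ce n'est en revanche pas le cas de `credit_card_number`, dont le minimum synthétique (≈ 6,0e10) est inférieur au minimum réel (≈ 5,0e11) ; cette colonne est un identifiant plutôt qu'une véritable grandeur numérique.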

## Diagnostic

### Risque

In [93]:
from sdv.evaluation.single_table import run_diagnostic

# Run the SDV diagnostic report on the synthetic data against the real data and metadata.
diagnostic = run_diagnostic(real_data=real_data, synthetic_data=synthetic_data, metadata=metadata)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 9/9 [00:00<00:00, 935.07it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 420.02it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



### Utilité

In [94]:
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic data matches the real data.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 903.34it/s]|
Column Shapes Score: 72.53%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 231.14it/s]|
Column Pair Trends Score: 72.95%

Overall Score (Average): 72.74%



## Graphiques

In [100]:
from sdv.evaluation.single_table import get_column_plot

# Plot the real vs. synthetic distribution of the 'room_type' column.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='room_type',
)
fig.show()