#Install requirements

In [None]:
!pip install sdv

In [None]:
!pip install sdmetrics

In [None]:
!pip install kaleido

In [None]:
%pip install pandas==1.5.2

In [None]:
!pip install plotly==5.10.0

#Acces to Google Drive files for use on colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#Imports

In [12]:
import pandas as pd

# Models
from sdv import Metadata
from sdv.relational import HMA1
from sdv.tabular import CTGAN

# Saving/Loading models
import pickle

# Constraints
from sdv.constraints import FixedCombinations, Unique, Inequality

# Metrics
from sdmetrics.reports.multi_table import DiagnosticReport
from sdmetrics.reports.multi_table import QualityReport
from sdv.metrics.relational import KSComplement
from sdmetrics.multi_table import CardinalityShapeSimilarity
from sdv.evaluation import evaluate
from sdmetrics.reports.multi_table import DiagnosticReport
from sdmetrics.reports.multi_table import QualityReport
from sdmetrics.reports import utils
from sdmetrics.column_pairs import ContingencySimilarity

# Timings
import time
import datetime

#Load data

In [4]:
# Load multi-table dataset
customers = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/customers.csv")
order_items = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/order_items.csv")
order_statuses = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/order_statuses.csv")
orders = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/orders_dummy.csv", sep=";")
products = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/products.csv")
shippers = pd.read_csv("drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/shippers_dummy.csv")

# Ensure datetime format
orders['order_date'] = pd.to_datetime(orders['order_date'])
orders['shipped_date'] = pd.to_datetime(orders['shipped_date'])

In [5]:
# create dictionnary with tables: used for evaluating data
tables = dict(customers=customers, order_items=order_items, order_statuses=order_statuses, orders=orders, products=products, shippers=shippers)

#Load model

In [6]:
# Load your model
model_multi = pd.read_pickle("/content/drive/MyDrive/PTUT - Synthetic/multi_table_model.pkl")

#Generate new samples

In [9]:
# Generate new data
new_data = model_multi.sample(num_rows=5000)

  table_rows[name] = table_rows[name].dropna().astype(dtype)


# GENERATE REPORT FOR THE MULTI MODEL

In [10]:
metadata = model_multi.metadata
metadata_dict = metadata.to_dict()
metadata_dict

{'tables': {'customers': {'fields': {'address': {'type': 'categorical',
     'pii': True,
     'pii_category': 'street_address'},
    'phone': {'type': 'categorical',
     'pii': True,
     'pii_category': 'phone_number'},
    'first_name': {'type': 'categorical',
     'pii': True,
     'pii_category': 'first_name'},
    'last_name': {'type': 'categorical',
     'pii': True,
     'pii_category': 'last_name'},
    'birth_date': {'type': 'categorical',
     'pii': True,
     'pii_category': 'date_of_birth'},
    'customer_id': {'type': 'id', 'subtype': 'integer'},
    'city': {'type': 'categorical'},
    'state': {'type': 'categorical'},
    'points': {'type': 'numerical', 'subtype': 'integer'}},
   'constraints': [{'constraint': 'sdv.constraints.tabular.FixedCombinations',
     'column_names': ['city', 'state']}],
   'primary_key': 'customer_id'},
  'orders': {'fields': {'order_id': {'type': 'id', 'subtype': 'integer'},
    'customer_id': {'type': 'id',
     'subtype': 'integer',
     '

In [13]:
# Evaluate the new data
evaluate(new_data, tables, metadata_dict)

0.8781445023950375

In [None]:
#Diagnostic Report Gen
diag_report = DiagnosticReport()
diag_report.generate(tables, new_data, metadata_dict)

#Quality Report Gen
qual_report = QualityReport()
qual_report.generate(tables, new_data, metadata_dict)

#Save the reports
#qual_report.save(filepath='/content/drive/MyDrive/PTUT - Synthetic/Rapports/quality_report_multi.pkl')
#diag_report.save(filepath='/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_report_multi.pkl')

In [7]:
#Load the report
report_diag = DiagnosticReport.load('/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_report_multi.pkl')
report_qual = QualityReport.load('/content/drive/MyDrive/PTUT - Synthetic/Rapports/quality_report_multi.pkl')

In [8]:
#Quality Score 
Score = report_qual.get_score()
print("Score =", Score)
 
#Column Shapes
fig = report_qual.get_visualization(property_name='Column Shapes', table_name = 'orders')
fig.show()

#Column Pair Trends
fig2 = report_qual.get_visualization(property_name='Column Pair Trends', table_name = 'orders')
fig2.show()

#Save figures
fig.to_image('png')
fig2.to_image('png')

Score = 0.5959371894862516


b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03\x84\x00\x00\x03\x84\x08\x06\x00\x00\x00\xf9N\xff;\x00\x00 \x00IDATx^\xec\x9d\x07t\x15E\xdb\xc7\x1f\xbaHSD\xa4\x08\x8a\x80\xe0+\x16TD\x9a\xf4\xde{\xef\xbd$\xf4\x96\x04\x08-\xf4@\xe8\xbd\x85^C\xefE@\xa4\x89"\xa0(R\xa4H\x95"\xbd\x7f\xe7\x19\xbe{MB\xca\xcd\xdd\xb9wfw\xff{\xcew\xce\xf7\x86\x9d\x99g~\xcf$\xeeo\xa7l\xbc\x17/^\xbc \\ \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02\xb6#\x10\x0fBh\xbb\x9c\xa3\xc3 \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02  \x08@\x081\x10@\x00\x04@\x00\x04@\x00\x04@\x00\x04@\x00\x04@\xc0\xa6\x04 \x846M<\xba\r\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02\x10B\x8c\x01\x10\x00\x01\x10\x00\x01\x10\x00\x01\x10\x00\x01\x10\x00\x01\x10\xb0)\x01\x08\xa1M\x13\x8fn\x83\x00\x08\x80\x00\x08\x80\x00\x08\x80\x00\x08\x80\x00\x08\x80\x00\x84\x10c\x00\x04@\x00\x04@\x00\x04@\x00\x04@\x00\x04@\x00\x04lJ\x00Bh\xd3\xc4\xa3\xdb \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00\x02 \x00!\xc4\x18\x00\x01\x10\x00\x01\x10\x00\x01\x10\x0

In [9]:
#Diagnostic Score 
results = report_diag.get_results()
print("Resultats =", results)
 
#Synthesis
fig = report_diag.get_visualization( property_name='Synthesis',table_name = 'products')
fig.show()

#Coverage
fig2 = report_diag.get_visualization( property_name='Coverage',table_name = 'products')
fig2.show()

#Boundaries
fig3 = report_diag.get_visualization( property_name='Boundaries',table_name = 'products')
fig3.show()

#Save figures
fig.to_image('png')
fig2.to_image('png')
fig3.to_image('png')



b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xbc\x00\x00\x01\xf4\x08\x06\x00\x00\x00\xdfY\xfe\xdf\x00\x00 \x00IDATx^\xec\x9dy\xbcM\xd5\xfb\xc7?\xe6\xa92$\x91R\x94R\xa8\x88\x94R\xe6)C\xe6y\x9e\xe7\xe1\x1aB\xe6y\x9e\x87\x90\x08e\x88\x8c\x91!\x94H\x94\nE\xa62\x872e\xbc.\xbf\xd7\xb3\xfc\xce\xf9\xde{\xdd\xcb\xd5\xde\xcf:\xf7l\x9f\xfd\xcf\xf7\xdbu\xf6\xf3\xec\xf5~\xf6:\xeb\xbd\xd7^{\x9f87o\xde\xbc\tn$@\x02$@\x02$@\x02$@\x02$\xe0Q\x02q(\xbc\x1e\xad,\x9bE\x02$@\x02$@\x02$@\x02$`\x08Pxy"\x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\tx\x9a\x00\x85\xd7\xd3\xe5e\xe3H\x80\x04H\x80\x04H\x80\x04H\x80\x04(\xbc<\x07H\x80\x04H\x80\x04H\x80\x04H\x80\x04<M\x80\xc2\xeb\xe9\xf2\xb2q$@\x02$@\x02$@\x02$@\x02\x14^\x9e\x03$@\x02$@\x02$@\x02$@\x02\x9e&@\xe1\xf5ty\xd98\x12 \x01\x12 \x01\x12 \x01\x12 \x01\n/\xcf\x01\x12 \x01\x12 \x01\x12 \x01\x12 \x01O\x13\xa0\xf0z\xba\xbcl\x1c\t\x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\x85\x97\xe7\x00\t\x90\x00\t\x90\x00\t\x90\x00\t\x90\x80\xa7\tPx=]^6\x8e\x04H\x80\x04H\x80\x04H\x80\x

In [None]:
fig = utils.get_column_plot(
    real_data=tables,
    synthetic_data=new_data,
    column_name='unit_price',
    metadata=metadata_dict
)

fig.show()

In [None]:
fig = utils.get_column_pair_plot(
    real_data=tables,
    synthetic_data=new_data,
    column_names=['order_status','status'],
    metadata=metadata_dict   
)

fig.show()

In [None]:
ContingencySimilarity.compute(
    real_data=tables['customers'][['state', 'city']],
    synthetic_data=new_data['customers'][['state', 'city']]
)