#Install Requirements

In [None]:
!pip install sdv

In [None]:
!pip install sdmetrics

In [None]:
!pip install kaleido

In [None]:
# Pin pandas to an exact release so every run uses the same version.
# NOTE(review): presumably chosen for compatibility with the sdv/sdmetrics
# releases used below — confirm.
%pip install pandas==1.5.2

In [None]:
!pip install plotly==5.10.0

#Access Google Drive files (for use on Colab)

In [None]:
# Colab only: expose the user's Google Drive on the local file system.
# Every data/model/report path below is located under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Imports

In [None]:
import pandas as pd

# Models
from sdv.tabular import CTGAN

# Saving/Loading models
import pickle

# Constraints
from sdv.constraints import FixedCombinations, Unique, Inequality

# Metrics
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from sdv.evaluation import evaluate
from sdmetrics.reports import utils

# Timings
import time
import datetime

#Load data

In [None]:
# Read the aggregated (joined) table; the CSV uses ';' as field separator.
# An alternative input, "joined_table.csv" in the same Drive folder, was used
# here previously (presumably the full-size table — confirm before swapping).
joined = pd.read_csv(
    "/content/drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/joined_half.csv",
    sep=";",
)

# Parse both date columns from day/month/year text into datetimes.
for date_column in ('order_date', 'shipped_date'):
    joined[date_column] = pd.to_datetime(joined[date_column], format="%d/%m/%Y")

#Load models

In [None]:
# Restore both trained models with SDV's own loader so they are loaded the
# same way (previously one of them went through pandas' generic pickle reader).
# NOTE(review): assumes the constrained model was also saved as an SDV tabular
# model — confirm.
# SECURITY: these are pickle files; unpickling can execute arbitrary code, so
# only load model files that you created yourself or otherwise trust.
model_constraint = CTGAN.load("/content/drive/MyDrive/PTUT - Synthetic/joined_model_constraints.pkl")
model_normal = CTGAN.load('/content/drive/MyDrive/PTUT - Synthetic/joined_model.pkl')

#Generate new samples

In [None]:
# Draw the same number of synthetic rows from each model.
n_synthetic_rows = 5000

# new_data  -> sampled from the model with constraints
# new_data2 -> sampled from the model without constraints
new_data = model_constraint.sample(num_rows=n_synthetic_rows)
new_data2 = model_normal.sample(num_rows=n_synthetic_rows)

In [None]:
# Evaluate each synthetic sample against the real table.
# A notebook cell only displays its LAST bare expression, so both results are
# stored and printed explicitly; otherwise the first one is silently discarded.
score_constraint = evaluate(new_data, joined)
score_normal = evaluate(new_data2, joined)

print("Evaluation (model with constraints)    =", score_constraint)
print("Evaluation (model without constraints) =", score_normal)

#GENERATE REPORT FOR THE SINGLE MODEL WITH CONSTRAINTS

In [None]:
# Fetch the metadata object of the constrained model and convert it to its
# dictionary form; `metadata_dict` is what the plotting helpers below receive
# as their `metadata=` argument.
metadata = model_constraint.get_metadata()
metadata_dict = metadata.to_dict()
# Bare last expression: shown as the cell output for inspection.
metadata_dict

In [None]:
# Restore the previously saved diagnostic and quality reports from Drive
# instead of regenerating them.
diagnostic_report_path = '/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_report.pkl'
quality_report_path = '/content/drive/MyDrive/PTUT - Synthetic/Rapports/quality_report.pkl'

report_diag = DiagnosticReport.load(diagnostic_report_path)
report_qual = QualityReport.load(quality_report_path)

In [None]:
# Overall quality score of the loaded quality report.
score = report_qual.get_score()
print("Score =", score)

# Column Shapes
fig = report_qual.get_visualization(property_name='Column Shapes')
fig.show()

# Column Pair Trends
fig2 = report_qual.get_visualization(property_name='Column Pair Trends')
fig2.show()

# Save figures as PNG files next to the saved reports.
# `to_image` only RETURNS the encoded bytes (and, as the last expression of the
# cell, dumps them into the output); `write_image` actually writes the file.
# Static export relies on the kaleido package installed at the top.
fig.write_image('/content/drive/MyDrive/PTUT - Synthetic/Rapports/quality_column_shapes.png')
fig2.write_image('/content/drive/MyDrive/PTUT - Synthetic/Rapports/quality_column_pair_trends.png')

Score = 0.7089177187935858


b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03\x84\x00\x00\x03\x84\x08\x06\x00\x00\x00\xf9N\xff;\x00\x00 \x00IDATx^\xec\x9d\x05tTG\x1b\x86\xdf$\xb8\xbb\xbb\x17+R\xa4\x14ww\xd7`\xc1%\x10,\xb8\x05wwww+\x14/\x0e-R\xa4\xb8k\xa1P\xa0\x10\xf9\xcf7\xfc\xbb\x8dl\xc2^v\xb3ww\xf3\xdes8\xa7M\xe6\x8e<3\xbb\xb9\xcf\xfdF\\\x02\x02\x02\x02\xc0\x8b\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04H \xc2\x11p\xa1\x10F\xb8>g\x83I\x80\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04H@\x11\xa0\x10r \x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\t\x90@\x04%@!\x8c\xa0\x1d\xcff\x93\x00\t\x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\t\x90\x00\x85\x90c\x80\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04"(\x01\na\x04\xedx6\x9b\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04H\x80\x04(\x84\x1c\x03$@\x02$@\x02$@\x02$@\x02$@\x02$\x10A\tP\x08#h\xc7\xb3\xd9$@\x02$@\x02$@\x02$@\x02$@\x02$@!\xe4\x18 \x01\x12 \x01\x12 \x01\x12 \x01\x12 \x01\x12 \x81\x08J\x80B\x18A;\x9e\xcd&\x01\x12 \x01\x12 \x01\x12 \x01\x12 \x01\x12 \x01\n!\xc7\x00\t\x90

In [None]:
from sdmetrics.reports import utils

# Real vs. synthetic distribution of `unit_price` for the model WITH
# constraints. This section reports on the constrained model, so the sample
# drawn from it (`new_data`) is plotted, not the unconstrained `new_data2`.
fig = utils.get_column_plot(
    real_data=joined,
    synthetic_data=new_data,
    column_name='unit_price',
    metadata=metadata_dict
)

fig.show()

In [None]:
from sdmetrics.reports import utils

# Compare how the two status columns relate to each other in the real table
# and in the sample from the constrained model.
status_columns = ['order_status','status']
fig = utils.get_column_pair_plot(
    real_data=joined,
    synthetic_data=new_data,
    column_names=status_columns,
    metadata=metadata_dict,
)
fig.show()

In [None]:
# Diagnostic results of the loaded diagnostic report.
results = report_diag.get_results()
print("Resultats =", results)

# Synthesis
fig = report_diag.get_visualization(property_name='Synthesis')
fig.show()

# Coverage
fig2 = report_diag.get_visualization(property_name='Coverage')
fig2.show()

# Boundaries
fig3 = report_diag.get_visualization(property_name='Boundaries')
fig3.show()

# Save figures as PNG files next to the saved reports.
# `to_image` only RETURNS the encoded bytes (and, as the last expression of the
# cell, dumps them into the output); `write_image` actually writes the file.
# Static export relies on the kaleido package installed at the top.
fig.write_image('/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_synthesis.png')
fig2.write_image('/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_coverage.png')
fig3.write_image('/content/drive/MyDrive/PTUT - Synthetic/Rapports/diagnostic_boundaries.png')



b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xbc\x00\x00\x01\xf4\x08\x06\x00\x00\x00\xdfY\xfe\xdf\x00\x00 \x00IDATx^\xec\xddy\xbcM\xf5\xf7\xc7\xf1e\x1e\x1aD\x92R\x8aR**\xbe4h2O\x99\xe7y\x9e\x92\xf9\x1a\x8a\xcc\xf3<\x0f\x95DT\x86\xccD\x86\xd0@\xa2T(2\x15\xa1\x10!\xf3\xf4{\xac\x8f\xdf\xb9\xdd\xd1]w\xdf\xcb\xdd\xf7z\xed\x7f~\xbf\xaf\xd6=g\x9f\xe7g\x9d\xb3\xdf\xe7\xb3?{\x9fDW\xae\\\xb9"l\x08 \x80\x00\x02\x08 \x80\x00\x02\x08$P\x81D\x04\xde\x04:\xb2\xbc,\x04\x10@\x00\x01\x04\x10@\x00\x01\'@\xe0\xa5\x11\x10@\x00\x01\x04\x10@\x00\x01\x04\x12\xb4\x00\x817A\x0f//\x0e\x01\x04\x10@\x00\x01\x04\x10@\x80\xc0K\x0f \x80\x00\x02\x08 \x80\x00\x02\x08$h\x01\x02o\x82\x1e^^\x1c\x02\x08 \x80\x00\x02\x08 \x80\x00\x81\x97\x1e@\x00\x01\x04\x10@\x00\x01\x04\x10H\xd0\x02\x04\xde\x04=\xbc\xbc8\x04\x10@\x00\x01\x04\x10@\x00\x01\x02/=\x80\x00\x02\x08 \x80\x00\x02\x08 \x90\xa0\x05\x08\xbc\tzxyq\x08 \x80\x00\x02\x08 \x80\x00\x02\x04^z\x00\x01\x04\x10@\x00\x01\x04\x10@ A\x0b\x10x\x13\xf4\xf0\xf2\xe2\x10@\x00\x01

In [None]:
from sdmetrics.column_pairs import ContingencySimilarity

# Contingency similarity of the (state, city) pair between the real table and
# the sample from the constrained model. The value is stored and printed
# because a cell only displays its last bare expression.
state_city_similarity = ContingencySimilarity.compute(
    real_data=joined[['state', 'city']],
    synthetic_data=new_data[['state', 'city']]
)
print("ContingencySimilarity (state, city) =", state_city_similarity)

# Real vs. synthetic distribution of `unit_price` for the constrained model.
# The original call was the truncated, non-existent `utils.get_(...)`; given the
# `column_name=` argument the intended helper is `get_column_plot`.
fig = utils.get_column_plot(
    real_data=joined,
    synthetic_data=new_data,
    column_name='unit_price',
    metadata=metadata_dict
)

fig.show()

0.6439128451380552

#GENERATE REPORT FOR THE SINGLE MODEL WITHOUT CONSTRAINTS

In [None]:
# Fetch the metadata object of the model without constraints and convert it to
# its dictionary form; `metadata_dict2` is passed to the report generation
# below.
metadata1 = model_normal.get_metadata()
metadata_dict2 = metadata1.to_dict()
# Bare last expression: shown as the cell output for inspection.
metadata_dict2

In [None]:
from sdmetrics.reports.single_table import DiagnosticReport
from sdmetrics.reports.single_table import QualityReport

# Build the diagnostic and quality reports for the model without constraints:
# real table vs. its synthetic sample, described by that model's metadata.
diag_report = DiagnosticReport()
qual_report = QualityReport()
for report in (diag_report, qual_report):
    report.generate(joined, new_data2, metadata_dict2)

# Persist both reports to Drive so later runs can load them instead of
# regenerating.
qual_report.save(
    filepath='/content/drive/MyDrive/PTUT - Synthetic/quality_report2.pkl'
)
diag_report.save(
    filepath='/content/drive/MyDrive/PTUT - Synthetic/diagnostic_report2.pkl'
)

In [None]:
# Restore the reports of the model without constraints from Drive.
# Note: this rebinds `report_diag` / `report_qual`, which held the reports of
# the constrained model earlier in the notebook.
diagnostic_report_path = '/content/drive/MyDrive/PTUT - Synthetic/diagnostic_report2.pkl'
quality_report_path = '/content/drive/MyDrive/PTUT - Synthetic/quality_report2.pkl'

report_diag = DiagnosticReport.load(diagnostic_report_path)
report_qual = QualityReport.load(quality_report_path)

In [None]:
from sdmetrics.reports import utils

# Overall quality score of the currently loaded quality report
# (model without constraints).
score = report_qual.get_score()
print("Score =", score)

# Column Shapes
fig = report_qual.get_visualization(property_name='Column Shapes')
fig.show()

# Column Pair Trends
fig2 = report_qual.get_visualization(property_name='Column Pair Trends')
fig2.show()

# Save figures as PNG files next to the saved "report2" pickles.
# `to_image` only RETURNS the encoded bytes (and, as the last expression of the
# cell, dumps them into the output); `write_image` actually writes the file.
# Static export relies on the kaleido package installed at the top.
fig.write_image('/content/drive/MyDrive/PTUT - Synthetic/quality2_column_shapes.png')
fig2.write_image('/content/drive/MyDrive/PTUT - Synthetic/quality2_column_pair_trends.png')

In [None]:
# Detail table behind the "Column Shapes" property of the quality report.
column_shapes_details = report_qual.get_details(property_name='Column Shapes')
column_shapes_details

In [None]:
# Detail table behind the "Column Pair Trends" property of the quality report.
column_pair_trends_details = report_qual.get_details(property_name='Column Pair Trends')
column_pair_trends_details

In [None]:
from sdmetrics.reports import utils

# Real vs. synthetic distribution of `unit_price` for the model WITHOUT
# constraints. Uses this model's own metadata (`metadata_dict2`) rather than
# the constrained model's `metadata_dict` left over from the previous section.
fig = utils.get_column_plot(
    real_data=joined,
    synthetic_data=new_data2,
    column_name='unit_price',
    metadata=metadata_dict2
)

fig.show()

In [None]:
from sdmetrics.reports import utils

# Relationship between the two status columns, real vs. synthetic, for the
# model WITHOUT constraints. Uses this model's own metadata (`metadata_dict2`)
# rather than the constrained model's `metadata_dict` from the previous section.
fig = utils.get_column_pair_plot(
    real_data=joined,
    synthetic_data=new_data2,
    column_names=['order_status','status'],
    metadata=metadata_dict2
)

fig.show()