# Installations

In [None]:
!pip install sdv

In [None]:
!pip install sdmetrics

# Imports

In [None]:
import pandas as pd

# Models
from sdv.tabular import CTGAN

# Saving/Loading models
import pickle

# Constraints
from sdv.constraints import FixedCombinations, Unique, Inequality

# Metrics
#from sdmetrics.reports.single_table import DiagnosticReport
from sdmetrics.reports.multi_table import QualityReport
from sdv.metrics.relational import KSComplement
from sdmetrics.multi_table import CardinalityShapeSimilarity
from sdv.evaluation import evaluate

# Timings
import time
import datetime

# Access to Google Drive files for use on Colab

In [None]:
from google.colab import drive

# Mount the user's Google Drive so the dataset and saved model under
# /content/drive/MyDrive are reachable from this Colab runtime.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Load data

In [None]:
# Read the aggregated (joined) dataset from Google Drive; the file is ';'-separated.
# An alternative input, "joined_table.csv" in the same Drive folder, was read here
# previously (presumably the full-size table -- confirm before switching back).
joined = pd.read_csv(
    "/content/drive/MyDrive/PTUT - Synthetic/Data_Synthetic_Large/joined_half.csv",
    sep=";",
)

# Known gotcha: an integer id column containing NaN is read as float. An earlier version
# cast `shipper_id` to the nullable 'Int64' dtype on a separate `orders` frame; that cast
# is not applied to `joined` here.

# Parse both date columns, which are stored as day/month/year strings.
for date_column in ('order_date', 'shipped_date'):
    joined[date_column] = pd.to_datetime(joined[date_column], format="%d/%m/%Y")

# Add constraints

In [None]:
# Columns whose values must be kept together as combinations: (city, state) and
# (status, order_status).
city_state_constraint = FixedCombinations(
    column_names=['city', 'state'],
)
order_status_constraint = FixedCombinations(
    column_names=['status', 'order_status'],
)

# No date Inequality constraint is included: it does not work with NaT values.
constraints = [
    city_state_constraint,
    order_status_constraint,
]

# Fit the model

In [None]:
# Columns to anonymize, mapped to the category of replacement value to generate
# (presumably Faker provider names -- confirm against the SDV documentation).
pii_field_categories = {
    'address': 'address',
    'first_name': 'first_name',
    'last_name': 'last_name',
    'birth_date': 'date_of_birth',
}

model = CTGAN(
    constraints=constraints,
    anonymize_fields=pii_field_categories,
    verbose=True,
)

# Fit on the joined table, recording wall-clock timestamps on either side so the
# following cell can report the training duration.
start_time = time.time()
model.fit(joined)
end_time = time.time()

In [None]:
# Get training time (takes 2h24min on colab)
# `start_time`/`end_time` come from time.time(), so their difference is in seconds.
# timedelta's first positional argument is *days*; pass the value as `seconds=` so the
# duration is rendered as H:MM:SS instead of thousands of days.
str(datetime.timedelta(seconds=end_time - start_time))

# Save/Load model

In [None]:
# Persist the fitted model. The path is relative, so the file is written to the
# notebook's current working directory.
# NOTE(review): the load cell reads '/content/drive/MyDrive/PTUT - Synthetic/joined_model.pkl',
# which is a different location -- the file has to be copied there (or the two paths
# aligned) for a freshly trained model to be the one that gets loaded. Confirm the workflow.
model.save(path='joined_model.pkl')

In [None]:
# Restore a previously saved CTGAN model from Google Drive.
# NOTE(review): saved models are presumably pickle-based; only load files from a trusted source.
loaded = CTGAN.load(
    path='/content/drive/MyDrive/PTUT - Synthetic/joined_model.pkl',
)

sdv used version `0.17.1`; current version is ``
rdt used version `1.2.1`; current version is ``
ctgan used version `0.5.2`; current version is ``


# Generate a new data sample

In [None]:
# Number of synthetic rows to draw from the reloaded model.
N_SYNTHETIC_ROWS = 5000

# Sample a new synthetic dataset from the loaded model.
new_data = loaded.sample(num_rows=N_SYNTHETIC_ROWS)

N.B. The quality and diagnostic reports can be found in the "Lecteur rapport" files