In [1]:
pip install sdv torch

Collecting sdv
  Downloading sdv-1.19.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.37.30-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.37.30-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from sdv)
  Downloading rdt-1.15.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.19.0 (from sdv)
  Downloading sdmetrics-0.19.0-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)

In [29]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer

In [3]:
transactions_df = pd.read_csv('/content/transactions.csv')
training_df = pd.read_csv('/content/training_data.csv')

In [4]:
transactions_df['user_id'] = transactions_df['subject'].str.replace('User ', '').astype(int)

In [5]:
transactions_df.head()

Unnamed: 0,subject,verb,object,time,amount,mask,user_id
0,User 40578,bought,book,2023-01-11,1524,"[1, 1, 1, 1, 1]",40578
1,User 94768,paid,laptop,2022-11-23,-1,"[1, 1, 1, 1, 0]",94768
2,User 65142,paid,service,2023-03-25,-1,"[1, 1, 1, 1, 0]",65142
3,User 10499,returned,book,2023-12-19,-1,"[1, 1, 1, 1, 0]",10499
4,User 89538,ordered,phone,2025-07-13,3949,"[1, 1, 1, 1, 1]",89538


In [6]:
training_df.drop('Unnamed: 0', axis=1, inplace=True)

In [7]:
training_df.head()

Unnamed: 0,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,object_phone,object_product,object_service,mask_time,mask_amount,unix_time_norm,amount_norm
0,40578,True,False,False,False,False,False,False,True,False,False,False,False,1,1,0.99147,0.652465
1,94768,False,False,False,True,False,False,False,False,True,False,False,False,1,0,0.991085,0.0
2,65142,False,False,False,True,False,False,False,False,False,False,False,True,1,0,0.992044,0.0
3,10499,False,False,False,False,True,False,False,True,False,False,False,False,1,0,0.994159,0.0
4,89538,False,False,True,False,False,False,False,False,False,True,False,False,1,1,0.998656,0.894989


In [8]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           20000 non-null  int64  
 1   verb_bought       20000 non-null  bool   
 2   verb_cancelled    20000 non-null  bool   
 3   verb_ordered      20000 non-null  bool   
 4   verb_paid         20000 non-null  bool   
 5   verb_returned     20000 non-null  bool   
 6   verb_sold         20000 non-null  bool   
 7   object_accessory  20000 non-null  bool   
 8   object_book       20000 non-null  bool   
 9   object_laptop     20000 non-null  bool   
 10  object_phone      20000 non-null  bool   
 11  object_product    20000 non-null  bool   
 12  object_service    20000 non-null  bool   
 13  mask_time         20000 non-null  int64  
 14  mask_amount       20000 non-null  int64  
 15  unix_time_norm    20000 non-null  float64
 16  amount_norm       20000 non-null  float6

In [9]:
merged_df = pd.merge(
    training_df,
    transactions_df,
    on="user_id",
    how="left"
).drop(columns=["subject"])

In [10]:
merged_df.head()

Unnamed: 0,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,...,object_service,mask_time,mask_amount,unix_time_norm,amount_norm,verb,object,time,amount,mask
0,40578,True,False,False,False,False,False,False,True,False,...,False,1,1,0.99147,0.652465,bought,book,2023-01-11,1524,"[1, 1, 1, 1, 1]"
1,94768,False,False,False,True,False,False,False,False,True,...,False,1,0,0.991085,0.0,paid,laptop,2022-11-23,-1,"[1, 1, 1, 1, 0]"
2,65142,False,False,False,True,False,False,False,False,False,...,True,1,0,0.992044,0.0,paid,service,2023-03-25,-1,"[1, 1, 1, 1, 0]"
3,10499,False,False,False,False,True,False,False,True,False,...,False,1,0,0.994159,0.0,returned,book,2023-12-19,-1,"[1, 1, 1, 1, 0]"
4,10499,False,False,False,False,True,False,False,True,False,...,False,1,0,0.994159,0.0,cancelled,book,2023-04-22,-1,"[1, 1, 1, 1, 0]"


In [11]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(merged_df)

In [12]:
metadata.update_column(
    column_name='user_id',
    sdtype='id',
    regex_format=r'User \d{5}'
)

metadata.update_column(
    column_name='verb_bought',
    sdtype='boolean'
)

metadata.update_column(
    column_name='amount_norm',
    sdtype='numerical'
)

In [13]:
merged_df['synthetic_id'] = range(1, len(merged_df) + 1)
metadata.add_column('synthetic_id', sdtype='id')
metadata.set_primary_key('synthetic_id')

In [14]:
metadata.validate()

In [15]:
synthesizer = CTGANSynthesizer(
    metadata,
    enforce_rounding=False,
    epochs=100,
    batch_size=1000,
    verbose=True
)



In [16]:
synthesizer.fit(merged_df)

PerformanceAlert: Using the CTGANSynthesizer on this data is not recommended. To model this data, CTGAN will generate a large number of columns.

Original Column Name   Est # of Columns (CTGAN)
verb_bought            2
verb_cancelled         2
verb_ordered           2
verb_paid              2
verb_returned          2
verb_sold              2
object_accessory       2
object_book            2
object_laptop          2
object_phone           2
object_product         2
object_service         2
mask_time              2
mask_amount            2
unix_time_norm         11
amount_norm            11
verb                   6
object                 6
time                   1462
amount                 11
mask                   4

We recommend preprocessing discrete columns that can have many values, using 'update_transformers'. Or you may drop columns that are not necessary to model. (Exit this script using ctrl-C)


Gen. (-0.60) | Discrim. (-0.34): 100%|██████████| 100/100 [03:33<00:00,  2.13s/it]


In [17]:
synthetic_data = synthesizer.sample(num_rows=20000)

In [19]:
# verb correction
for index, row in synthetic_data.iterrows():
    verb = row["verb"]
    synthetic_data.at[index, f"verb_{verb}"] = True
    for col in synthetic_data.columns:
        if col.startswith("verb_") and col != f"verb_{verb}":
            synthetic_data.at[index, col] = False

In [21]:
# object correction
for index, row in synthetic_data.iterrows():
    obj = row["object"]
    synthetic_data.at[index, f"object_{obj}"] = True
    for col in synthetic_data.columns:
        if col.startswith("object_") and col != f"object_{obj}":
            synthetic_data.at[index, col] = False

In [23]:
pd.set_option('display.max_columns', None)

In [24]:
synthetic_data.head()

Unnamed: 0,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,object_phone,object_product,object_service,mask_time,mask_amount,unix_time_norm,amount_norm,verb,object,time,amount,mask,synthetic_id
0,User 05238,False,False,False,False,False,True,False,False,False,True,False,False,0,0,0.982253,0.0,sold,phone,2024-05-04,16,"[1, 1, 1, 0, 0]",14702678
1,User 16772,False,False,True,False,False,False,False,False,False,False,True,False,1,0,0.997952,0.0,ordered,product,2024-09-01,-3,"[1, 1, 1, 1, 0]",1688051
2,User 05777,False,True,False,False,False,False,True,False,False,False,False,False,1,0,0.984748,0.0,cancelled,accessory,[PAD],-19,"[1, 1, 1, 1, 0]",5506731
3,User 18236,False,False,False,False,False,True,False,False,False,False,True,False,1,0,1.0,0.0,sold,product,2023-11-04,-64,"[1, 1, 1, 1, 0]",972145
4,User 16612,False,False,True,False,False,False,False,False,True,False,False,False,1,1,0.986398,0.580751,ordered,laptop,2025-12-15,953,"[1, 1, 1, 1, 1]",2185867


In [26]:
quality_report = evaluate_quality(
    real_data=merged_df,
    synthetic_data=synthetic_data,
    metadata=metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 23/23 [00:00<00:00, 45.90it/s]|
Column Shapes Score: 92.06%

(2/2) Evaluating Column Pair Trends: |██████████| 253/253 [00:03<00:00, 82.50it/s]|
Column Pair Trends Score: 87.51%

Overall Score (Average): 89.79%



the result is excellent, we won't increase the epochs for now.

In [30]:
# model saving
synthesizer.save("ctgan_model.pkl")

In [31]:
# model load
synthesizer = CTGANSynthesizer.load("ctgan_model.pkl")