now let's think about how we can automate everything

In [1]:
pip install sdv

Collecting sdv
  Downloading sdv-1.20.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.37.34-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.37.34-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.16.0 (from sdv)
  Downloading rdt-1.16.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.20.1 (from sdv)
  Downloading sdmetrics-0.20.1-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [2]:
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import warnings

In [3]:
warnings.filterwarnings('ignore')

In [26]:
class SyntheticDataGenerator:
    def __init__(self, model_path=None):
        self.model_path = model_path
        self.model = None
        self.metadata = None
        self.original_columns = []

    def _validate_input(self, data):
        required_columns = ['subject', 'verb', 'object']
        for col in required_columns:
            if col not in data.columns:
                raise ValueError(f"The mandatory column '{col}' is missing from the data")
        self.original_columns = data.columns.tolist()  # preserving the original speakers

    def _generate_dummy_columns(self, data):
        # One-hot for 'verb' and 'object'
        for col in ['verb', 'object']:
            for value in data[col].unique():
                dummy_col = f"{col}_{value}"
                data[dummy_col] = (data[col] == value).astype(int)

        # fictitious numeric columns
        data['amount'] = np.random.uniform(10, 1000, size=len(data))
        data['transaction_id'] = np.arange(len(data))

        # timestamps
        data['timestamp'] = pd.to_datetime('2023-01-01') + pd.to_timedelta(
            np.random.randint(0, 365, len(data)), 'days'
        )

        # masks and flags
        data['is_valid'] = 1
        data['fraud_flag'] = 0

        return data

    def _train_model(self, data, epochs=100):
        self.metadata = SingleTableMetadata()
        self.metadata.detect_from_dataframe(data)
        self.model = CTGANSynthesizer(self.metadata, epochs=epochs)
        self.model.fit(data)

        if self.model_path:
            self.model.save(self.model_path)

    def generate(self, input_data, num_rows=100, retrain_on_error=True):
        try:
            self._validate_input(input_data)
            enriched_data = self._generate_dummy_columns(input_data.copy())

            if not self.model:
                if self.model_path:
                    self.model = CTGANSynthesizer.load(self.model_path)
                else:
                    self._train_model(enriched_data)

            synthetic_data = self.model.sample(num_rows=num_rows)

            # restore original values from one-hot
            for col_type in ['verb', 'object']:
                for col in synthetic_data.columns:
                    if col.startswith(f"{col_type}_"):
                        mask = synthetic_data[col] == 1
                        synthetic_data.loc[mask, col_type] = col.replace(f"{col_type}_", "")

            # filtering of source columns only
            result = synthetic_data[self.original_columns].copy()

            return result

        except Exception as e:
            if retrain_on_error:
                print(f"Error: {str(e)}\nBegin retraining the model...")
                self._train_model(enriched_data)
                return self.generate(input_data, num_rows, retrain_on_error=False)
            else:
                raise RuntimeError("Failed to generate data after retraining")

example of use

In [36]:
# example of use
if __name__ == "__main__":
    # user uploads their data
    user_data = pd.DataFrame({
        'subject': ['user1', 'user2', 'user3'],
        'verb': ['buy', 'return', 'order'],
        'object': ['laptop', 'book', 'phone']
    })

    # generator initialization (you can specify the path to save the model)
    generator = SyntheticDataGenerator(model_path='ctgan_model.pkl')

    # data gen
    try:
        synthetic_df = generator.generate(user_data, num_rows=50)
        print("Generated data:")
        print(synthetic_df.head())

        synthetic_df.to_csv('synthetic_transactions.csv', index=False)
        print("The results are saved in synthetic_transactions.csv")

    except Exception as e:
        print(f"Critical error: {str(e)}")

Generated data:
  subject    verb  object
0   user3  return   phone
1   user3   order   phone
2   user2   order    book
3   user1   order  laptop
4   user1   order    book
The results are saved in synthetic_transactions.csv


In [37]:
# example of use
if __name__ == "__main__":
    # user uploads their data
    user_data = pd.DataFrame({
        'subject': ['user1', 'user2', 'user3'],
        'verb': ['buy', 'return', 'order'],
        'object': ['laptop', 'book', 'phone'],
        'amount': [100, 50, 75]
    })

    # generator initialization (you can specify the path to save the model)
    generator = SyntheticDataGenerator(model_path='ctgan_model.pkl')

    # data gen
    try:
        synthetic_df = generator.generate(user_data, num_rows=50)
        print("Generated data:")
        print(synthetic_df.head())

        synthetic_df.to_csv('synthetic_transactions.csv', index=False)
        print("The results are saved in synthetic_transactions.csv")

    except Exception as e:
        print(f"Critical error: {str(e)}")

Generated data:
  subject    verb  object      amount
0   user3  return   phone  526.484494
1   user3   order   phone  609.341797
2   user2   order    book  419.866854
3   user1   order  laptop  419.866854
4   user1   order    book  419.866854
The results are saved in synthetic_transactions.csv


In [38]:
# example of use
if __name__ == "__main__":
    # user uploads their data
    user_data = pd.DataFrame({
        'subject': ['user1', 'user2', 'user3'],
        'verb': ['buy', 'return', 'order'],
        'object': ['laptop', 'book', 'phone'],
        'timestamp': ['2023-01-01', '2023-01-02', '2023-01-03']
    })

    # generator initialization (you can specify the path to save the model)
    generator = SyntheticDataGenerator(model_path='ctgan_model.pkl')

    # data gen
    try:
        synthetic_df = generator.generate(user_data, num_rows=50)
        print("Generated data:")
        print(synthetic_df.head())

        synthetic_df.to_csv('synthetic_transactions.csv', index=False)
        print("The results are saved in synthetic_transactions.csv")

    except Exception as e:
        print(f"Critical error: {str(e)}")

Generated data:
  subject    verb  object  timestamp
0   user3  return   phone 2023-11-01
1   user3   order   phone 2023-07-15
2   user2   order    book 2023-05-21
3   user1   order  laptop 2023-03-07
4   user1   order    book 2023-07-14
The results are saved in synthetic_transactions.csv


In [39]:
# example of use
if __name__ == "__main__":
    # user uploads their data
    user_data = pd.DataFrame({
        'subject': ['user1', 'user2', 'user3'],
        'verb': ['buy', 'return', 'order'],
        'object': ['laptop', 'book', 'phone'],
        'user_name': ['John', 'Alice', 'Bob'],
        'user_age': [30, 25, 35]
    })

    # generator initialization (you can specify the path to save the model)
    generator = SyntheticDataGenerator(model_path='ctgan_model.pkl')

    # data gen
    try:
        synthetic_df = generator.generate(user_data, num_rows=50)
        print("Generated data:")
        print(synthetic_df.head())

        synthetic_df.to_csv('synthetic_transactions.csv', index=False)
        print("The results are saved in synthetic_transactions.csv")

    except Exception as e:
        print(f"Critical error: {str(e)}")

Error: "['user_name', 'user_age'] not in index"
Begin retraining the model...
Generated data:
  subject    verb  object user_name  user_age
0   user3   order   phone      John        25
1   user3   order  laptop      John        25
2   user2   order  laptop     Alice        31
3   user1  return  laptop     Alice        35
4   user3   order    book       Bob        25
The results are saved in synthetic_transactions.csv
