# Synthetic Data Generation Using Tabula

In [None]:
# Clone the Tabula repository to access required utilities
!git clone https://github.com/zhao-zilong/Tabula.git

# Install required dependencies
!pip install datasets>=2.5.2
!pip install numpy>=1.24.2
!pip install tqdm>=4.64.1
!pip install transformers>=4.22.1
!pip install pandas>=1.4.4
!pip install scikit_learn>=1.1.1
!pip install torch>=1.10.2

# Add the Tabula repository path to Python's system path
import sys
sys.path.append('/content/Tabula')  # Adjust the path if necessary to point to the Tabula directory

# Import necessary libraries for data handling, machine learning, and transformers
import os
import random
import warnings
import logging
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, AutoConfig
from datasets import Dataset
from torch.utils.data import DataLoader
from dataclasses import dataclass

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")


# Load the Real Dataset

In [4]:
import pandas as pd
# Load the insurance dataset from the specified URL
data_url = "https://raw.githubusercontent.com/zhao-zilong/Tabula/main/Real_Datasets/Insurance_compressed.csv"

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(data_url)

# Display the first few rows of the dataset for inspection
print("Original Dataset Preview:")
print(data.head())

# Check the dataset information for data types and missing values
print("\nDataset Info:")
print(data.info())


Original Dataset Preview:
   age  sex     bi  children  sm  region     charges
0   18    0  53.13         0   0       0   1163.4627
1   22    0  52.58         1   1       0  44501.3982
2   23    0  50.38         1   0       0   2438.0552
3   58    0  49.06         0   0       0  11381.3254
4   46    1  48.07         2   0       3   9432.9253

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bi        1338 non-null   float64
 3   children  1338 non-null   int64  
 4   sm        1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB
None


# Define utility functions

In [5]:
# Function to convert a Numpy array into a Pandas DataFrame
def _array_to_dataframe(data, columns=None):
    """Convert a numpy array to a pandas DataFrame."""
    if isinstance(data, pd.DataFrame):
        return data
    assert isinstance(data, np.ndarray), "Input data must be a Pandas DataFrame or Numpy NDArray"
    assert columns, "Column names must be provided for conversion"
    return pd.DataFrame(data=data, columns=columns)

# Function to get the distribution of a column
def _get_column_distribution(df, col):
    """Extract distribution information for a specified column."""
    if df[col].dtype in ["float", "int"]:
        return df[col].to_list()
    else:
        return df[col].value_counts(normalize=True).to_dict()

# Function to pad tokens for consistency in sequence length
def _pad(sequence, length, pad_value=50256):
    """Pad sequences to a fixed length with a specified value."""
    return [pad_value] * (length - len(sequence)) + sequence

# Function to pad a batch of token sequences
def _pad_tokens(tokens):
    """Pad multiple sequences to the same length."""
    max_length = max(len(token) for token in tokens)
    return [_pad(token, max_length) for token in tokens]

# Function to seed random number generators for reproducibility
def _seed_worker(_):
    """Set seed for random number generators."""
    worker_seed = torch.initial_seed() % 2**32
    random.seed(worker_seed)
    np.random.seed(worker_seed)
    torch.manual_seed(worker_seed)
    torch.cuda.manual_seed_all(worker_seed)


# Initialize Tabula Model


In [15]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [20]:
def fit(self, data):
    """Train the Tabula model."""
    print("Preparing data for training...")
    self.columns = data.columns  # Store column information

    # Encode categorical columns if applicable
    if self.categorical_columns:
        data = self.encode_categorical_column(data)

    # Convert data to text format
    data['text'] = data.apply(lambda row: ", ".join([f"{col} {row[col]}" for col in data.columns]), axis=1)

    # Tokenize the text data
    def tokenize_function(examples):
        tokens = self.tokenizer(
            examples["text"], truncation=True, padding=True, max_length=128
        )
        tokens["labels"] = tokens["input_ids"].copy()  # Add labels for loss computation
        return tokens

    # Convert to Hugging Face Dataset and tokenize
    dataset = Dataset.from_pandas(data[['text']])
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=self.experiment_dir,
        num_train_epochs=self.epochs,
        per_device_train_batch_size=self.batch_size,
        save_strategy="no",
        logging_strategy="no",
        report_to="none",  # Disable wandb and other logging integrations
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    # Train the model
    print("Training the model...")
    trainer.train()


In [23]:
def sample(self, n_samples, temperature=0.7, k=100, max_length=50):
    """
    Generate synthetic samples using the pre-trained Tabula model.
    """
    print(f"Generating {n_samples} synthetic samples...")
    self.model.eval()
    synthetic_data = []

    for _ in range(n_samples):
        # Use a default prompt or any starting text
        inputs = self.tokenizer("START", return_tensors="pt", padding=True, truncation=True)

        # Add attention_mask and set pad_token_id
        input_ids = inputs["input_ids"].to(self.model.device)
        attention_mask = inputs["attention_mask"].to(self.model.device)

        # Generate text with pre-trained model
        outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            do_sample=True,
            top_k=k,
            temperature=temperature,
            pad_token_id=self.tokenizer.eos_token_id  # Explicitly handle padding
        )

        # Decode generated text and collect results
        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        synthetic_data.append(decoded)

    # Return generated data as a DataFrame for easy use
    return pd.DataFrame(synthetic_data, columns=["Generated Data"])


In [None]:

# Initialize and train the model
model = Tabula(
    llm="distilgpt2",
    experiment_dir="insurance_training",
    epochs=10,
    batch_size=32,
    categorical_columns=["sex", "children", "smoker", "region"]
)

print("Training the Tabula model...")
model.fit(data)

print("\nGenerating synthetic data...")
synthetic_data = model.sample(n_samples=1338)

# Save the synthetic data
synthetic_data.to_csv("synthetic_insurance_data.csv", index=False)

# Display the synthetic data
print("\nSynthetic Data Preview:")
print(synthetic_data.head())

Training the Tabula model...
Preparing data for training...


Map:   0%|          | 0/1338 [00:00<?, ? examples/s]

Training the model...


Step,Training Loss


Step	Training Loss
500	1.820100

1500	1.439500

2000	1.310000

2500	1.189100

3000	1.089400

3500	1.025200

4000	0.980300

4500	0.946100

5000	0.913400

5500	0.884500

6000	0.854700

6500	0.831000

7000	0.809000

7500	0.788700

8000	0.772800

8500	0.759400

9000	0.746300

9500	0.735700

10000	0.727500

10500	0.718800

11000	0.713400

11500	0.706200

12000	0.699800

12500	0.695900

13000	0.691300

13500	0.688900

14000	0.686600

14500	0.683100

15000	0.681300

15500	0.678600

16000	0.677100

16500	0.676500




