### Using LLMs to generate synthetic data

In [None]:
import pandas as pd

# Build a small sample table of people; it is the seed data that gets
# serialized to text below.
data = {
    'Name': ['John', 'Alice', 'Bob', 'Emma', 'Michael', 'Sophia', 'William', 'Olivia', 'James', 'Charlotte'],
    'Age': [30, 25, 35, 28, 32, 27, 40, 22, 38, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia', 'Phoenix', 'San Antonio', 'San Diego', 'Dallas', 'San Jose']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Serialize every row as one "Name: ..., Age: ..., City: ..." line, each
# terminated by a newline. Building the text with a single join avoids
# growing a string with += inside a per-row iterrows() loop.
text_data = "".join(
    f"Name: {name}, Age: {age}, City: {city}\n"
    for name, age, city in zip(df['Name'], df['Age'], df['City'])
)

# Write the text with an explicit encoding so the bytes on disk do not depend
# on the platform's default locale encoding.
with open("your_training_data.txt", "w", encoding="utf-8") as file:
    file.write(text_data)

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# --- Model and tokenizer ---------------------------------------------------
# Checkpoint to start from; another GPT-2 variant name can be substituted here.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# --- Training data ---------------------------------------------------------
# Tokenize and encode the text file written by the previous cell.
# block_size should be adjusted to the amount of data available.
# NOTE(review): the saved output of this cell reports global_step=5 for
# 5 epochs, i.e. a single optimizer step per epoch -- confirm that the dataset
# holds as many examples as intended at this block_size.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="your_training_data.txt",
    block_size=125,
)

# --- Training configuration ------------------------------------------------
# Collected in one mapping so every tunable value is visible at a glance.
training_config = {
    "output_dir": "./output",
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "save_steps": 10_000,
    "save_total_limit": 2,
    "prediction_loss_only": True,
}
training_args = TrainingArguments(**training_config)

# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Fine-tuning -----------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Left as the last expression so the notebook displays the training summary.
trainer.train()

2024-02-29 20:59:35.943866: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-29 20:59:35.943930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-29 20:59:35.946245: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-29 20:59:35.959581: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Step,Training Loss


TrainOutput(global_step=5, training_loss=1.324582290649414, metrics={'train_runtime': 20.7901, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.24, 'total_flos': 318960000000.0, 'train_loss': 1.324582290649414, 'epoch': 5.0})

In [None]:
# Generate new rows using the fine-tuned model
import re

# One complete row, in exactly the format the training text uses.
ROW_PATTERN = re.compile(r"Name: (?P<Name>[^,]+), Age: (?P<Age>\d+), City: (?P<City>[^,]+)")


def _parse_generated_rows(text):
    """Extract well-formed rows from model-generated text.

    Args:
        text: Generated text, expected to hold newline-separated lines of the
            form "Name: <name>, Age: <int>, City: <city>".

    Returns:
        A list of dicts with keys 'Name' (str), 'Age' (int) and 'City' (str),
        one per line that matches the row format exactly. Lines that do not
        match are skipped instead of raising.
    """
    # The piece after the last newline is dropped: it was not terminated, so
    # it may be a row that was cut off when generation hit its length limit.
    complete_lines = text.split("\n")[:-1]

    rows = []
    for line in complete_lines:
        match = ROW_PATTERN.fullmatch(line.strip())
        if match is None:
            continue
        rows.append({
            'Name': match['Name'].strip(),
            'Age': int(match['Age']),
            'City': match['City'].strip(),
        })
    return rows


generated_data = []

for index, row in df.iterrows():
    # Example prompt: "Name: John, Age: 30, City: New York"
    prompt = f"Name: {row['Name']}, Age: {row['Age']}, City: {row['City']}"

    # Tokenize with an attention mask and place the tensors on the model's
    # device, so generation still works if the model is no longer on the CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]

    # Generate continuation using the fine-tuned model.
    # NOTE(review): decoding is left at the generate() defaults; consider
    # sampling with a fixed seed if more varied rows are wanted.
    output = model.generate(
        **inputs,
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token of its own
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)

    # Parse only the newly generated tokens. The returned sequence starts with
    # the prompt, so parsing the full text would just hand back the original
    # row instead of new values.
    continuation = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
    generated_data.extend(_parse_generated_rows(continuation))

# Create a new DataFrame with the generated data; the explicit column list
# keeps the schema stable even when no well-formed row was generated.
new_df = pd.DataFrame(generated_data, columns=['Name', 'Age', 'City'])
print("\nNew DataFrame with Generated Rows:")
print(new_df)