In [2]:

import gradio as gr
from transformers import pipeline
import pandas as pd

# Build the text-generation pipeline once at import time so every request
# served by the UI reuses the same loaded model instead of reloading it.
generator = pipeline(
    task="text-generation",
    model="gpt2",
)

def generate_synthetic_data(prompt, num_samples=3, max_length=80):
    """
    Generate synthetic dataset entries based on the user prompt.
    For example: 'Generate 5 fake job descriptions for software engineers.'

    Args:
        prompt: Text handed unchanged to the module-level ``generator``
            pipeline for every sample.
        num_samples: Number of rows to produce. Numeric UI widgets may
            deliver a float (e.g. ``3.0``) or ``None`` when the field is
            cleared, so the value is truncated to an int; ``None`` and
            non-positive values produce an empty table instead of an error.
        max_length: Forwarded to the pipeline as its ``max_length`` argument.

    Returns:
        A pandas DataFrame with the columns ``id`` (1-based sample index)
        and ``synthetic_text`` (the stripped generated text). The columns
        are present even when no rows were generated.
    """
    # range() rejects floats and None, so normalise the requested count first.
    count = 0 if num_samples is None else max(0, int(num_samples))

    rows = []
    for sample_id in range(1, count + 1):
        # One sampled sequence per call; the pipeline returns a list of dicts.
        output = generator(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.9,
        )
        text = output[0]["generated_text"].strip()
        rows.append({"id": sample_id, "synthetic_text": text})

    # Explicit columns keep the schema stable for the zero-row case.
    return pd.DataFrame(rows, columns=["id", "synthetic_text"])


# Gradio front end: a free-text prompt plus a sample count go in, and the
# generated rows come back as a table.
demo = gr.Interface(
    fn=generate_synthetic_data,
    inputs=[
        gr.Textbox(label="Describe the dataset you want to generate", placeholder="e.g., Generate 3 fake job postings for data scientists"),
        # precision=0 makes the widget deliver a whole number (the callback
        # feeds it to range()); minimum=1 rules out zero or negative counts.
        gr.Number(label="Number of samples", value=3, precision=0, minimum=1)
    ],
    outputs=gr.Dataframe(label="Generated Synthetic Data"),
    title="🧠 Synthetic Data Generator",
    description=(
        "Generate synthetic data for any business domain using an open-source text generation model. "
        "Useful for creating test datasets such as product descriptions, job postings, customer reviews, etc."
    ),
    # Each example row is [prompt, number of samples], matching `inputs`.
    examples=[
        ["Generate 5 fake product descriptions for a tech company", 5],
        ["Create 3 synthetic job listings for software engineers", 3],
        ["Generate 4 customer reviews for a travel agency", 4]
    ]
)

# share=False: do not request a public share link for the app.
demo.launch(share=False)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

