# Initial Note
After running experiments in Colab using open-source models from Hugging Face, I decided to do the exercise with OpenAI. The reason is that Llama 3.2 frequently did not follow the prompts correctly, leading to inconsistencies and poor performance. Additionally, using larger models significantly increased processing time, making them less practical for this task.

The code from this notebook will be reorganized into modules for the final demo.

# Module to generate synthetic data

In [None]:

import re 

# Raw control characters that JSON forbids inside string literals, mapped to
# their escaped spelling.
_CONTROL_CHAR_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}

# What may legally follow a closing quote: optional whitespace, then a
# structural character or the end of the text.
_STRING_END_FOLLOWER = re.compile(r"\s*(?:[,:}\]]|\Z)")

# A comma followed only by whitespace and a closing bracket/brace is trailing.
_TRAILING_COMMA_FOLLOWER = re.compile(r"\s*[}\]]")


def _repair_json_syntax(text: str) -> str:
    """Escape string contents and drop trailing commas in one left-to-right pass.

    The scan tracks whether it is inside a string literal so that structural
    fixes never touch string contents and string fixes never touch structure.
    Text that is already valid JSON is returned unchanged.
    """
    pieces = []
    in_string = False
    pos = 0
    length = len(text)
    while pos < length:
        char = text[pos]
        if in_string:
            if char == "\\" and pos + 1 < length:
                # Existing escape sequence: copy both characters untouched so
                # an already-escaped quote is not mistaken for a terminator.
                pieces.append(text[pos:pos + 2])
                pos += 2
                continue
            if char == '"':
                if _STRING_END_FOLLOWER.match(text, pos + 1):
                    in_string = False
                    pieces.append(char)
                else:
                    # Heuristic: a quote not followed by JSON structure is
                    # part of the value, so escape it.
                    pieces.append('\\"')
            else:
                pieces.append(_CONTROL_CHAR_ESCAPES.get(char, char))
        elif char == '"':
            in_string = True
            pieces.append(char)
        elif char == "," and _TRAILING_COMMA_FOLLOWER.match(text, pos + 1):
            pass  # trailing comma before ] or }: drop it
        else:
            pieces.append(char)
        pos += 1
    return "".join(pieces)


def _clean_json_output(raw_text: str) -> str:
    """Clean raw model output so that it parses as a JSON array.

    Steps, in order:
    - Remove Markdown code fences and HTML-like tags. The tag pattern is
      applied to the whole text, so tag-like text inside values is removed too.
    - Strip surrounding whitespace (after fence removal, so a fenced array is
      still recognised as starting with ``[``).
    - Wrap the text in ``[`` / ``]`` when either bracket is missing.
    - Inside string literals, escape raw newline, carriage return and tab
      characters, and escape double quotes that are not followed by JSON
      structure (best-effort heuristic; an inner quote directly followed by
      ``,``, ``:``, ``}`` or ``]`` is still read as the end of the string).
    - Remove trailing commas before ``]`` or ``}`` outside string literals.

    Args:
        raw_text: Text returned by the model.

    Returns:
        The cleaned text. Valid JSON arrays pass through unchanged; the result
        is not guaranteed to be valid JSON for arbitrarily malformed input.
    """
    text = re.sub(r"```(?:json)?", "", raw_text)
    text = re.sub(r"</?[^>]+>", "", text)
    # Strip only now: removing a fence leaves newlines at both ends, which
    # would otherwise defeat the bracket checks below and nest the array.
    text = text.strip()

    if not text.startswith('['):
        text = '[' + text
    if not text.endswith(']'):
        text += ']'

    return _repair_json_syntax(text)


In [14]:
import pandas as pd
import json
import openai
import tempfile


def generate_synthetic_data_openai(
    system_prompt: str,
    user_prompt: str,
    reference_file=None,
    openai_model: str = "gpt-4o-mini",
    max_tokens: int = 2048,
    temperature: float = 0.0,
):
    """Generate synthetic tabular data with an OpenAI chat model.

    Args:
        system_prompt: System message sent to the model.
        user_prompt: User message sent to the model.
        reference_file: Optional CSV source (anything ``pd.read_csv``
            accepts). When given, its rows are appended to the user prompt as
            a structural reference.
        openai_model: Name of the chat model to call.
        max_tokens: Completion token limit passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        A ``(df, csv_path)`` tuple: the parsed DataFrame and the path of a
        temporary CSV file holding the same data. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If the model returns no text content, or the cleaned
            output is not valid JSON.
    """
    # Build the full user prompt, optionally embedding the reference rows.
    if reference_file:
        df_ref = pd.read_csv(reference_file)
        reference_data = df_ref.to_dict(orient="records")
        user_prompt_full = (
            f"{user_prompt}\nFollow the structure and distribution of the reference data, "
            f"but do NOT copy any exact values:\n{reference_data}"
        )
    else:
        user_prompt_full = user_prompt

    # Call OpenAI.
    response = openai.chat.completions.create(
        model=openai_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt_full},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    raw_text = response.choices[0].message.content
    if raw_text is None:
        # Fail with the same exception type as unparseable output instead of
        # an AttributeError inside the cleaner.
        raise ValueError("El modelo no devolvió contenido de texto.")
    cleaned_json = _clean_json_output(raw_text)

    # Parse the JSON, keeping the decoder error as the explicit cause.
    try:
        data = json.loads(cleaned_json)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"JSON inválido generado. Error: {e}\nOutput truncado: {cleaned_json[:500]}"
        ) from e

    df = pd.DataFrame(data)

    # Reserve a temp path and close the handle *before* pandas opens it:
    # reopening a still-open NamedTemporaryFile by name fails on Windows, and
    # the context manager guarantees the handle is released on any error.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)

    return df, csv_path


# Default prompts

In [19]:
# Default system prompt. Backslashes are doubled on purpose: in a regular
# (non-raw) string literal `\"`, `\n`, `\t` and `\r` would otherwise be
# interpreted by Python, so the model would see no backslash in rule 8 and
# raw control characters in the middle of rule 10.
SYSTEM_PROMPT = """
You are a precise synthetic data generator. Your only task is to output valid JSON arrays of dictionaries.

Rules:
1. Output a single JSON array starting with '[' and ending with ']'.
2. Do not include markdown, code fences, or explanatory text — only the JSON.
3. Keep all columns exactly as specified; do not add or remove fields (index must be omitted).
4. Respect data types: text, number, date, boolean, etc.
5. Ensure internal consistency and realistic variation.
6. If a reference table is provided, generate data with similar statistical distributions for numerical and categorical variables, 
   but never copy exact rows. Each row must be independent and new.
7. For personal information (names, ages, addresses, IDs), ensure diversity and realism — individual values may be reused to maintain realism, 
   but never reuse or slightly modify entire reference rows.
8. Escape all internal double quotes in strings with a backslash (\\").
9. Replace any single quotes in strings with double quotes.
10. Escape newline, tab, and carriage return characters inside strings as \\n, \\t, \\r.
11. Remove any trailing commas before closing brackets.
12. Do not include any reference data or notes about it in the output.
13. The output must always be valid JSON parseable by standard JSON parsers.
"""

# Default user prompt: fixes the number of rows requested from the model.
USER_PROMPT = """
Generate exactly 15 rows of synthetic data following all the rules above. 
Ensure that all strings are safe for JSON parsing and ready to convert to a pandas DataFrame.
"""


# Test

To test our generator, we use the first 50 examples from the Reddit gaming comments with sentiments dataset.
Source: https://www.kaggle.com/datasets/sainitishmitta04/23k-reddit-gaming-comments-with-sentiments-dataset

In [16]:

# Run the generator with the default prompts and the reference CSV; the
# returned temporary CSV path is not needed here, so it is discarded.
df, _ = generate_synthetic_data_openai(
    SYSTEM_PROMPT,
    USER_PROMPT,
    reference_file="data/sentiment_reference.csv",
)

In [18]:
# Show the generated DataFrame as the cell output.
df

Unnamed: 0,Comment,sentiment
0,"Them: ""I can't believe you made it this far!""\...",positive
1,Then you realize they have a better strategy t...,negative
2,There's nothing quite like the joy of sharing ...,positive
3,[This is a screenshot of my character](https:/...,negative
4,"Haha, I remember when my friend tried to intro...",positive
5,I'm currently experiencing this with my partne...,positive
6,Then they start to outshine you and you feel l...,positive
7,"###Take your time, it's all about the journey\...",positive
8,"Them: ""Wait, how did you do that?""\n\nMe: ""Jus...",neutral
9,"Don't worry about the mistakes, it's all part ...",positive


In [17]:
# Print the first generated comment in full to check that quotes and line
# breaks survived the JSON round trip.
print(df["Comment"][0])

Them: "I can't believe you made it this far!"

Me: "I just followed the map, it's not that hard!"


# Gradio Demo

In [5]:
import gradio as gr

# Build the Gradio interface: prompt editors and an optional reference CSV as
# inputs, a table and a downloadable file as outputs.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Synthetic Data Generator")

    # Editable prompts, pre-filled with the notebook's default prompts.
    with gr.Row():
        system_prompt_input = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT, lines=10)

    with gr.Row():
        user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5)

    # Optional reference data; the upload widget is restricted to .csv files.
    with gr.Row():
        reference_input = gr.File(label="Reference CSV (optional)", file_types=[".csv"])

    # Output widgets for the two values returned by the generator.
    output_df = gr.DataFrame(label="Generated Data")
    download_csv = gr.File(label="Download CSV")

    generate_btn = gr.Button("🚀 Generate Data")

    # The three inputs are passed positionally as system_prompt, user_prompt
    # and reference_file; model, max_tokens and temperature keep their
    # defaults. The returned (DataFrame, CSV path) pair fills the two outputs.
    generate_btn.click(
        fn=generate_synthetic_data_openai,
        inputs=[system_prompt_input, user_prompt_input, reference_input],
        outputs=[output_df, download_csv]
    )

# Start the app server for the interface defined above.
demo.launch(debug=True)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


