<a href="https://colab.research.google.com/github/NITHIN0710/llm-engineering/blob/main/Systhesis_Dataset_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --upgrade transformers bitsandbytes accelerate gradio pandas sentencepiece

In [None]:
# imports

import torch
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, TextStreamer
from IPython.display import display, Markdown
import gradio as gr
import json
import pandas as pd
import re

In [None]:
# Authenticate against the Hugging Face Hub with the "HF_TOKEN" Colab secret

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

# Checkpoint identifier used when loading the tokenizer and the model

LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# 4-bit quantization settings handed to the model loader

quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Load Tokenizer and Model

tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Reuse EOS as the padding token so padded batches / generate() have a pad id
# (presumably the checkpoint defines no dedicated pad token -- confirm).
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets the loader decide weight placement; the weights are
# quantized on load according to quant_config.
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
# The original message contained a mis-decoded UTF-8 check mark; restored here.
print("✅ LLaMA 3.2 3B model loaded with 4-bit quantization on GPU")

In [None]:
# Building Prompt

def build_prompt(dataset_description, num_samples):
    """Compose the user prompt asking the model for synthetic records.

    Args:
        dataset_description: Free-text description of the dataset; inserted
            verbatim between double quotes.
        num_samples: Number of records to request; formatted into the text.

    Returns:
        str: The full prompt (task statement, output rules, schema example).
    """
    # Only the task section depends on the arguments.
    task_section = f"""
You are a synthetic dataset generator.

Task:
Generate {num_samples} synthetic records for the following dataset:

"{dataset_description}"
"""
    # Static part: output rules plus a schema-only example. This is a plain
    # string, so the JSON braces need no escaping.
    fixed_section = """
Rules:
- Output ONLY valid JSON
- The JSON must be an array of objects
- Infer column names from the dataset description
- Use realistic but fake data
- Keep the same schema for all records
- Do NOT use placeholder names like field1, field2
- No explanations, no extra text

Example (schema only, not actual data):
If description is "student name and age":
[
  {
    "name": "string",
    "age": number
  }
]

Now generate the actual dataset.
"""
    return task_section + fixed_section


In [None]:
# Message Construction (CHAT Format)

def build_messages(dataset_description, num_samples):
    """Assemble the chat-format message list for one generation request.

    Args:
        dataset_description: Free-text description forwarded to build_prompt.
        num_samples: Requested record count forwarded to build_prompt.

    Returns:
        list[dict]: A system message followed by the user message, each a
        dict with "role" and "content" keys.
    """
    return [
        {
            "role": "system",
            "content": "You are a helpful AI assistant that generates synthesis dataset based on user's description of dataset",
        },
        {
            "role": "user",
            "content": build_prompt(dataset_description, num_samples),
        },
    ]

In [None]:
def generate_synthesis_dataset(dataset_description, num_samples, max_new_tokens=None):
    """Generate the raw model reply for a synthetic-dataset request.

    Args:
        dataset_description: Free-text description of the dataset.
        num_samples: Number of records to request from the model.
        max_new_tokens: Optional cap on generated tokens. When None, the cap
            scales with ``num_samples`` (never below 500) so larger requests
            are less likely to be cut off mid-JSON.

    Returns:
        str: The decoded assistant reply only (prompt tokens stripped,
        special tokens skipped).
    """
    messages = build_messages(dataset_description, num_samples)

    if max_new_tokens is None:
        try:
            requested = int(num_samples)
        except (TypeError, ValueError):
            requested = 0
        # Heuristic budget of ~100 tokens per record (assumption, tune as
        # needed). It is only an upper bound: generation still stops at EOS.
        max_new_tokens = max(500, 100 * requested)

    # return_dict=True yields both input_ids and attention_mask. The mask
    # cannot be inferred by generate() because pad and EOS share one id.
    # Inputs follow the model's device instead of a hard-coded "cuda".
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    ).to(model.device)

    input_len = encoded["input_ids"].shape[-1]

    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True
    )

    # decode newly generated tokens (assistant reply); the returned sequence
    # starts with the prompt, so skip the first input_len tokens
    generated_tokens = outputs.sequences[0][input_len:]
    decoded_output = tokenizer.decode(
        generated_tokens,
        skip_special_tokens=True
    )

    return decoded_output


In [None]:
# Model Output to Table

def json_to_dataframe(text):
    """Convert raw model output into a DataFrame of generated records.

    The text is scanned for the first JSON array of objects. Each candidate
    is decoded with ``json.JSONDecoder.raw_decode`` so that nested arrays or
    brackets inside string values do not cut the array short, and any prose
    or code fences around the JSON are ignored.

    Args:
        text: Raw model output. May be None or empty.

    Returns:
        pandas.DataFrame: One row per record on success. On failure, a
        single-row frame with columns ``error``, ``details`` and
        ``raw_output`` (first 300 characters of the cleaned text).
    """
    # Normalize up front so the error path can always slice a string,
    # even when the caller passed None.
    raw = "" if text is None else str(text)

    try:
        if not raw.strip():
            raise ValueError("Empty model output")

        # Remove special tokens explicitly
        raw = raw.replace("<|eot_id|>", "").strip()

        decoder = json.JSONDecoder()
        last_error = None

        # Every "[ {" is a possible start of an array of objects; the JSON
        # decoder itself determines where that array ends.
        for candidate in re.finditer(r"\[\s*\{", raw):
            try:
                data, _ = decoder.raw_decode(raw, candidate.start())
            except json.JSONDecodeError as exc:
                last_error = exc
                continue
            if isinstance(data, list) and all(isinstance(row, dict) for row in data):
                return pd.DataFrame(data)

        if last_error is not None:
            raise ValueError(f"No valid JSON array found: {last_error}")
        raise ValueError("No JSON array found")

    except Exception as e:
        return pd.DataFrame(
            {
                "error": ["Failed to parse JSON output"],
                "details": [str(e)],
                "raw_output": [raw[:300]]  # debugging aid
            }
        )


In [None]:
# Gradio Function

def gradio_generate(dataset_description, num_samples):
    """Gradio callback: generate records and return them as a table.

    Args:
        dataset_description: Text from the description box.
        num_samples: Selected number of samples.

    Returns:
        The DataFrame produced by json_to_dataframe from the model's raw reply.
    """
    return json_to_dataframe(
        generate_synthesis_dataset(dataset_description, num_samples)
    )

In [None]:
# GRADIO UI

# Static inputs for the UI, defined up front so the layout below reads cleanly.
sample_count_choices = [3, 5, 10, 15, 20]
example_rows = [
    ["Student records with name, age, course, GPA", 5],
    ["Employee records with ID, name, age, salary", 10],
    ["Hospital patient records with ID, name, disease, admission date", 5],
]

with gr.Blocks(title="Synthetic Dataset Generator") as demo:
    # Page header
    gr.Markdown(
        """
        # Synthetic Data Generator
        Generate realistic sample datasets from a simple description.
        """
    )

    # Input row: wide description box next to a narrower sample-count picker
    with gr.Row():
        with gr.Column(scale=2):
            desc = gr.Textbox(
                lines=2,
                label="Dataset Description",
                placeholder="e.g., Student records with name, age, course, GPA",
            )
        with gr.Column(scale=1):
            samples = gr.Dropdown(
                choices=sample_count_choices,
                value=5,
                label="Number of Samples",
            )

    # Clickable examples that fill both inputs
    gr.Examples(
        examples=example_rows,
        inputs=[desc, samples],
        label="Example Dataset Descriptions",
    )

    # Action buttons
    with gr.Row():
        btn = gr.Button(" Generate ", variant="primary")
        clear = gr.Button(" Clear ")

    # Read-only results table
    output = gr.Dataframe(
        label="Synthetic Dataset",
        show_row_numbers=False,
        interactive=False,
    )

    # Generate: run the model and show the parsed table
    btn.click(fn=gradio_generate, inputs=[desc, samples], outputs=output)

    # Clear: blank the description and empty the table
    clear.click(fn=lambda: ("", None), inputs=[], outputs=[desc, output])

# share=True requests a public link; debug=True keeps the cell attached to the server
demo.launch(debug=True, share=True)
