<a href="https://colab.research.google.com/github/JacquieAM/ai-projects/blob/main/quantum_synthetic_data_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# PyTorch stack pinned to the CUDA 12.4 (+cu124) builds, fetched from the PyTorch wheel index.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# Model tooling: bitsandbytes, transformers and accelerate are pinned; requests and openai are not.
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai
# UI library: no version pin and no -q flag, so this prints the full install log.
!pip install gradio

In [None]:
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# import torch, json


In [None]:
# imports
import gradio as gr
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import json

In [None]:
# Show whether PyTorch can see a CUDA GPU; later cells move tensors to "cuda".
torch.cuda.is_available()


In [None]:
# Hugging Face Hub id of the checkpoint passed to the tokenizer and model loaders.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# Authenticate with the Hugging Face Hub.
# The token is read from the Colab secret named HF_TOKEN and the hub client is
# asked to also register it as a git credential.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# bitsandbytes settings handed to the model loader as `quantization_config`.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # 4-bit quantization type
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

In [None]:
 # System prompt describing the generator's role.
 # Adjacent string literals are concatenated verbatim, so the first fragment
 # must end with a space to keep "produces" and "structured" as separate words.
 system_message = (
     "You are a specialized synthetic data generator that produces "
     "structured, JSON-formatted datasets for quantum computing research and simulations."
 )

In [None]:
# Fixed request (five quantum-circuit records) that is embedded in `prompt`
# for the direct model call further down.
user_message = " ".join(
    [
        "Generate 5 synthetic quantum circuit configurations.",
        "Each record must include: circuit_id, number_of_qubits (1-20),",
        "gate_types (choose from H, X, Y, Z, CNOT, T, S), depth (1-50),",
        "fidelity (0-1), error_rate (0-0.1). Output as a JSON array.",
    ]
)

In [None]:
# Default number of synthetic records to request.
num_records = 5

In [None]:
# Request templates keyed by dataset type (the same names the UI dropdown offers).
#
# These are plain strings, not f-strings: each keeps a literal `{num_records}`
# placeholder that is filled in later with `str.format(num_records=...)`, so the
# record count is chosen per request instead of being frozen when this cell runs.
# Every fragment ends with a space because adjacent literals are joined verbatim.
# All templates ask for a JSON array, which is what the response parser looks for.
prompt_templates = {
    "Quantum Circuits": (
        "Generate {num_records} synthetic quantum circuit configurations. "
        "Each record must include: circuit_id, number_of_qubits (1-20), "
        "gate_types (choose from H, X, Y, Z, CNOT, T, S), depth (1-50), "
        "fidelity (0-1), error_rate (0-0.1). Output as a JSON array."
    ),
    "Quantum Experiment Logs": (
        "Generate {num_records} synthetic quantum experiment logs. "
        "Each record must include: experiment_id, timestamp, qubit_state ( |0>, |1>, superposition), "
        "measurement_result (matches qubit_state), decoherence_time (0.1-100 microseconds). "
        "Output as a JSON array."
    ),
    "Quantum Research Abstracts": (
        "Generate {num_records} synthetic quantum research abstracts. "
        "Each record must include: title, abstract_text, keywords, year (2000-2025). "
        "Focus on coherence and topic relevance. Output as a JSON array."
    ),
}

In [None]:
# Lay out the system and user messages in the Llama 3.1 Instruct chat format:
# every turn is wrapped as <|start_header_id|>role<|end_header_id|>\n\n...<|eot_id|>,
# and the trailing assistant header cues the model to start its answer.
# <|begin_of_text|> is intentionally not included: the tokenizer call that
# encodes this string is expected to prepend it (confirm if the tokenizer
# settings are changed).
prompt = (
    f"<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>"
    f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

In [None]:
# Tokenizer for the selected checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Model weights loaded with the quantization settings and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

# Single direct run of `prompt`: encode onto the GPU, generate up to 200 new
# tokens, then decode the first returned sequence without special tokens.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(max_new_tokens=200, **inputs)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
import json
import pandas as pd

def _first_record_array(text):
    """Return the first JSON array of objects embedded in ``text``.

    Every ``[`` is tried in turn as a possible start. ``JSONDecoder.raw_decode``
    parses exactly one JSON value and ignores whatever follows it, so brackets
    inside string values and trailing chatter do not confuse the search.
    Arrays whose items are not all objects are skipped; this keeps a nested
    list (e.g. one record's ``gate_types``) from being mistaken for the dataset
    when the outer array is incomplete.

    Raises:
        ValueError: if ``text`` contains no ``[`` or no candidate decodes to a
            list of objects.
    """
    start = text.find("[")
    if start == -1:
        raise ValueError("No JSON array found in model output")

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None  # not a complete JSON value at this bracket
        if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
            return candidate
        start = text.find("[", start + 1)

    raise ValueError("No complete JSON array of records found in model output")


def _rows_for_csv(records):
    """Copy ``records`` and spell out superposition states for the CSV export."""
    rows = []
    for record in records:
        row = dict(record)
        # "|>" and "superposition" are both written out as an explicit state.
        if "qubit_state" in row and row["qubit_state"] in ["|>", "superposition"]:
            row["qubit_state"] = "|0> + |1>"
        rows.append(row)
    return rows


def generate_data(dataset_type, num_records, file_format):
    """Ask the loaded model for a synthetic dataset and return it as text.

    Args:
        dataset_type: Key into ``prompt_templates`` selecting the request text.
        num_records: Number of records to request. Coerced with ``int()`` so a
            value such as ``5.0`` is rendered as ``5`` in the prompt.
        file_format: ``"CSV"`` (case-insensitive) for CSV text; any other
            value, including ``None``, yields JSON.

    Returns:
        str: Pretty-printed JSON array of records, or CSV text. If no record
        array can be parsed from the model output, a JSON array holding one
        object with ``error`` and ``raw_text`` keys. If CSV conversion fails,
        a plain-text message that includes the parsed data.

    Raises:
        KeyError: if ``dataset_type`` is not a key of ``prompt_templates``.
    """
    user_message = prompt_templates[dataset_type].format(num_records=int(num_records))

    # Let the tokenizer render the conversation with the model's own chat
    # template instead of hand-written role tags, and tokenize it in one step.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # NOTE(review): 800 new tokens may not fit large requests; a truncated
    # answer is reported through the parsing-error path below.
    outputs = model.generate(**inputs, max_new_tokens=800)

    # Decode only the newly generated tokens so the echoed prompt can never be
    # mistaken for part of the answer.
    prompt_length = inputs["input_ids"].shape[1]
    text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Best effort: any parsing problem is reported to the caller as data, not raised.
    try:
        data = _first_record_array(text)
    except Exception as e:
        error_data = [{"error": f"JSON parsing error: {str(e)}", "raw_text": text}]
        return json.dumps(error_data, indent=2)

    if (file_format or "").upper() == "CSV":
        try:
            return pd.DataFrame(_rows_for_csv(data)).to_csv(index=False)
        except Exception as e:
            return f"CSV conversion error: {str(e)}\n\nData:\n{data}"

    return json.dumps(data, indent=2)


In [None]:
# Gradio UI: choose a dataset type, record count and output format, then call
# generate_data and show the returned text.
dataset_types = [
    "Quantum Circuits",
    "Quantum Experiment Logs",
    "Quantum Research Abstracts",
]

file_formats = ["JSON", "CSV"]

with gr.Blocks(title="Quantum Synthetic Data Generator") as ui:
    # The title is a real Markdown heading separated by blank lines; without
    # the "#" and the blank lines it renders fused into the description text.
    gr.Markdown(
        "# Quantum Synthetic Data Generator\n\n"
        "Generate realistic, structured **synthetic datasets** for quantum computing "
        "research, simulations, or educational purposes.\n\n"
        "Powered by open-source **Llama 3.1 8B Instruct**."
    )

    dataset = gr.Dropdown(choices=dataset_types, label="Select Dataset Type", value="Quantum Circuits")
    count = gr.Slider(1, 20, step=1, value=5, label="Number of Records")
    file_format = gr.Dropdown(choices=file_formats, label="File Format", value="JSON")

    # NOTE(review): CSV results are displayed in this same JSON-highlighted box.
    output = gr.Code(label="Generated Data", language="json")
    generate_btn = gr.Button("Generate Quantum Data")

    generate_btn.click(
        fn=generate_data,
        inputs=[dataset, count, file_format],
        outputs=output,
    )

# Launch after the Blocks context has closed, so the layout and event wiring
# are complete before the app is served.
ui.launch(share=True)