In [None]:
from openai import OpenAI
from IPython.display import Markdown, display, update_display
from google.colab import userdata
from huggingface_hub import login

In [None]:
# Read the OpenAI key from Colab's `userdata` store instead of hard-coding it in the notebook.
openai_api_key = userdata.get("OPENAI_API_KEY")
# Client instance bound to the name `openai`; get_response() issues its request through it.
openai = OpenAI(api_key = openai_api_key)

In [None]:
# System prompt used as the "system" turn by get_messages(): it instructs the model to
# answer with a Markdown table only, and includes a small example of the expected layout.
system_message = """
You are a synthetic test data generator.

Your task is to create **realistic test datasets** in tabular format, based on the user's input.

Follow these instructions:
1. Output must always be in Markdown table format using `|` separators and `---` under headers.
2. First row: Column headers.
3. Second row: Dashes (---) to format the table.
4. Following rows: Data rows, one per line.
5. If the user provides column names, use those. Otherwise, create realistic column names.
6. If the user provides a specific domain (e.g., HR, healthcare, banking), generate data accordingly.
7. Generate only the number of rows requested.
8. Never include extra text or explanation — only the table.

Example format:

| Name        | Email                   | Date       |
|-------------|--------------------------|------------|
| John Smith  | john.smith@email.com     | 2023-05-01 |
| ...         | ...                      | ...        |
"""

In [None]:
def get_messages(user_prompt):
  """Build the two-turn chat payload for the OpenAI request.

  The module-level ``system_message`` is placed first as the system turn,
  followed by ``user_prompt`` as the user turn.
  """
  return [
      dict(role="system", content=system_message),
      dict(role="user", content=user_prompt),
  ]

In [None]:
def get_response(user_prompt):
  """Stream a synthetic-data table from gpt-4o-mini and render it live as Markdown.

  Args:
    user_prompt: Text describing the dataset to generate.

  Returns:
    The complete response text with Markdown code-fence markers removed, so
    callers such as ``data = get_response(...)`` receive the table instead of None.
  """
  stream = openai.chat.completions.create(
      model = "gpt-4o-mini",
      messages = get_messages(user_prompt),
      stream = True
  )
  raw = ""
  cleaned = ""
  display_handle = display(Markdown(""), display_id=True)
  for chunk in stream:
    # A streamed event can arrive with an empty `choices` list; indexing [0] would raise.
    if not chunk.choices:
      continue
    raw += chunk.choices[0].delta.content or ''
    # Clean the accumulated text (not the single chunk) so a fence marker split across
    # chunks is still caught. Only the fence markers are removed; the bare word
    # "markdown" inside real content is left intact.
    cleaned = raw.replace("```markdown", "").replace("```", "")
    update_display(Markdown(cleaned), display_id=display_handle.display_id)
  return cleaned

In [None]:
# Sample user prompt: 15 HR employee records with a fixed column list.
# Not referenced by any other visible cell.
test = """
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.
"""

In [None]:
# User prompt for a 20-row credit-default dataset; passed to get_response() in the next cell.
credit = """
Generate 20 rows of synthetic tabular data for a credit default risk prediction model.

Output the dataset in **Markdown table format**, with the following columns:

- CustomerID (use IDs like CUST0001, CUST0002, etc.)
- Age (in years)
- AnnualIncome (in USD)
- EmploymentStatus (Employed, Unemployed, Self-Employed, Student)
- CreditScore (between 300 and 850)
- NumOpenCreditLines (integer between 0 and 10)
- OutstandingDebt (in USD)
- LoanPurpose (e.g., Car, Home, Education, Business, Personal)
- PreviousDefaults (integer count)
- Defaulted (Yes or No)

Make the data realistic. Vary the values logically — for example, customers with multiple previous defaults and low credit scores are more likely to have “Defaulted = Yes”.

Output only the table and nothing else.
"""

In [None]:
# Stream the credit-risk table into the cell output; `data` keeps whatever get_response returns.
data = get_response(credit)
data

| CustomerID | Age | AnnualIncome | EmploymentStatus | CreditScore | NumOpenCreditLines | OutstandingDebt | LoanPurpose | PreviousDefaults | Defaulted |
|------------|-----|--------------|-------------------|-------------|---------------------|------------------|-------------|------------------|-----------|
| CUST0001   | 25  | 45000        | Employed           | 720         | 4                   | 15000            | Car         | 0                | No        |
| CUST0002   | 34  | 85000        | Employed           | 640         | 5                   | 20000            | Home        | 1                | Yes       |
| CUST0003   | 40  | 65000        | Self-Employed      | 580         | 3                   | 30000            | Business     | 2                | Yes       |
| CUST0004   | 29  | 30000        | Unemployed         | 520         | 2                   | 8000             | Personal     | 1                | Yes       |
| CUST0005   | 22  | 35000        | Student            | 700         | 1                   | 2000             | Education     | 0                | No        |
| CUST0006   | 45  | 120000       | Employed           | 750         | 6                   | 18000            | Home        | 0                | No        |
| CUST0007   | 31  | 47000        | Employed           | 600         | 3                   | 22000            | Personal     | 3                | Yes       |
| CUST0008   | 28  | 28000        | Unemployed         | 540         | 0                   | 6000             | Car         | 1                | Yes       |
| CUST0009   | 38  | 60000        | Self-Employed      | 720         | 4                   | 15000            | Business     | 0                | No        |
| CUST0010   | 55  | 90000        | Employed           | 800         | 7                   | 5000             | Home        | 0                | No        |
| CUST0011   | 23  | 39000        | Student            | 450         | 1                   | 5000             | Education     | 2                | Yes       |
| CUST0012   | 27  | 32000        | Unemployed         | 500         | 0                   | 12000            | Personal     | 1                | Yes       |
| CUST0013   | 42  | 72000        | Self-Employed      | 600         | 2                   | 25000            | Car         | 1                | Yes       |
| CUST0014   | 36  | 85000        | Employed           | 770         | 5                   | 10000            | Business     | 0                | No        |
| CUST0015   | 30  | 48000        | Employed           | 610         | 3                   | 16000            | Personal     | 1                | Yes       |
| CUST0016   | 47  | 95000        | Employed           | 790         | 8                   | 3000             | Home        | 0                | No        |
| CUST0017   | 32  | 53000        | Self-Employed      | 550         | 4                   | 12000            | Business     | 2                | Yes       |
| CUST0018   | 21  | 28000        | Student            | 480         | 1                   | 4000             | Education     | 3                | Yes       |
| CUST0019   | 49  | 110000       | Employed           | 850         | 9                   | 4000             | Home        | 0                | No        |
| CUST0020   | 37  | 69000        | Unemployed         | 610         | 2                   | 20000            | Personal     | 1                | Yes       |

Using open-source models:

In [None]:
# Read the Hugging Face token from Colab's `userdata` store and log in to the Hub with it.
hf_token = userdata.get("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

In [None]:
# Hugging Face Hub repo ids of candidate open-source models.
# Only GEMMA2 is referenced by the visible cells below.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct"
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# NOTE(review): despite the name, this id is a Mistral-7B repo, not Mixtral — confirm intent.
MIXTRAL_7B = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
STABLE = "stabilityai/stablelm-3b-4e1t"
# NOTE(review): the two "-GGUF" ids presumably hold GGUF-format weights, which may not load
# through a plain AutoModelForCausalLM.from_pretrained(id) call — verify before using them.
ZEPHYR = "TheBloke/zephyr-7B-beta-GGUF"
QWEN_MINI = "Qwen/Qwen1.5-0.5B-Chat"

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

In [None]:
# Re-declaration of the system prompt for the open-source-model section; the text is the same
# as the earlier `system_message` definition in this notebook. return_messages() reads it.
system_message = """
You are a synthetic test data generator.

Your task is to create **realistic test datasets** in tabular format, based on the user's input.

Follow these instructions:
1. Output must always be in Markdown table format using `|` separators and `---` under headers.
2. First row: Column headers.
3. Second row: Dashes (---) to format the table.
4. Following rows: Data rows, one per line.
5. If the user provides column names, use those. Otherwise, create realistic column names.
6. If the user provides a specific domain (e.g., HR, healthcare, banking), generate data accordingly.
7. Generate only the number of rows requested.
8. Never include extra text or explanation — only the table.

Example format:

| Name        | Email                   | Date       |
|-------------|--------------------------|------------|
| John Smith  | john.smith@email.com     | 2023-05-01 |
| ...         | ...                      | ...        |
"""

In [None]:
def return_messages(user_prompt):
  """Wrap ``user_prompt`` into a chat message list.

  Returns a two-element list: the system turn carrying the module-level
  ``system_message``, then the user turn carrying ``user_prompt``.
  """
  system_turn = {"role": "system", "content": system_message}
  user_turn = {"role": "user", "content": user_prompt}
  return [system_turn, user_turn]

In [None]:
# bitsandbytes quantization settings; passed as `quantization_config` when the model is loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_quant_type="nf4",             # use the NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # dtype used for computation
)

In [None]:
# HR prompt (15 employee records) fed to the local model, both in the step-by-step cells
# and through generate(user_prompt).
user_prompt = """
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.
"""

In [None]:
# Tokenize `user_prompt` with the Gemma-2 chat template and move the ids to the GPU.
tokenizer = AutoTokenizer.from_pretrained(GEMMA2, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# The system turn is dropped before templating, so only the user message reaches the model.
# NOTE(review): presumably because Gemma's chat template does not accept a system role — confirm.
messages_for_template = [msg for msg in return_messages(user_prompt) if msg['role'] != 'system']
# add_generation_prompt=True appends the template's assistant-turn header, so the model
# writes a reply instead of continuing from the end of the user's turn.
inputs = tokenizer.apply_chat_template(
    messages_for_template,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

In [None]:
# Load Gemma-2 with the 4-bit quantization settings from `quant_config`; device_map='auto'
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(GEMMA2, device_map='auto', quantization_config=quant_config)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
# !pip install -U bitsandbytes
# !pip install -U transformers accelerate

In [None]:
# Display the loaded model's module tree (the repr shows the Linear4bit projection layers).
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_

In [None]:
# Streamer handed to model.generate() so text is printed while tokens are being produced.
streamer = TextStreamer(tokenizer)

In [None]:
# Generate up to 1024 new tokens, echoing them live through the streamer.
output = model.generate(inputs,streamer=streamer, max_new_tokens=1024)
# Render only the newly generated tokens: output[0] begins with the prompt ids, and the
# chat-control tokens (<bos>, <end_of_turn>, ...) are not part of the Markdown table.
generated_ids = output[0][inputs.shape[-1]:]
display(Markdown(tokenizer.decode(generated_ids, skip_special_tokens=True)))

<bos><start_of_turn>user
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.<end_of_turn>


| Full Name | Employee ID | Department | Role | Joining Date | Email Address | Salary |
|---|---|---|---|---|---|---|
| John Doe | EMP1234 | Marketing | Marketing Manager | 2021-01-15 | john.doe@example.com | $85,000 |
| Jane Smith | EMP5678 | Finance | Financial Analyst | 2022-05-20 | jane.smith@example.com | $75,000 |
| David Lee | EMP9012 | Sales | Sales Representative | 2020-09-10 | david.lee@example.com | $90,000 |
| Sarah Jones | EMP

<bos><start_of_turn>user
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.<end_of_turn>


| Full Name | Employee ID | Department | Role | Joining Date | Email Address | Salary |
|---|---|---|---|---|---|---|
| John Doe | EMP1234 | Marketing | Marketing Manager | 2021-01-15 | john.doe@example.com | $85,000 |
| Jane Smith | EMP5678 | Finance | Financial Analyst | 2022-05-20 | jane.smith@example.com | $75,000 |
| David Lee | EMP9012 | Sales | Sales Representative | 2020-09-10 | david.lee@example.com | $90,000 |
| Sarah Jones | EMP1113 | HR | HR Generalist | 2021-08-25 | sarah.jones@example.com | $60,000 |
| Michael Brown | EMP2223 | IT | Network Engineer | 2022-03-10 | michael.brown@example.com | $110,000 |
| Emily Wilson | EMP3334 | Marketing | Content Writer | 2020-07-15 | emily.wilson@example.com | $45,000 |
| Robert Garcia | EMP4445 | Sales | Sales Manager | 2021-02-01 | robert.garcia@example.com | $120,000 |
| Christopher Miller | EMP5556 | Finance | Accountant | 2022-09-15 | christopher.miller@example.com | $55,000 |
| Ashley Rodriguez | EMP6667 | HR | Recruiter | 2020-04-01 | ashley.rodriguez@example.com | $65,000 |
| Matthew Davis | EMP7778 | IT | Systems Administrator | 2021-06-10 | matthew.davis@example.com | $70,000 |
| Jessica Taylor | EMP8889 | Marketing | Marketing Specialist | 2022-01-01 | jessica.taylor@example.com | $50,000 |
| Daniel Thompson | EMP9990 | Sales | Sales Representative | 2020-03-15 | daniel.thompson@example.com | $80,000 |
| Amanda Nguyen | EMP1010 | Finance | Financial Analyst | 2021-07-10 | amanda.nguyen@example.com | $70,000 |
| Thomas Jackson | EMP1111 | IT | Software Developer | 2022-05-25 | thomas.jackson@example.com | $100,000 |
| Isabella Garcia | EMP1212 | HR | Training Specialist | 2020-09-01 | isabella.garcia@example.com | $60,000 | 
| Kevin Lee | EMP1313 | Sales | Sales Manager | 2021-08-01 | kevin.lee@example.com | $95,000 | 
| 
<end_of_turn>

In [None]:
def generate(messages):
  """Load Gemma-2 in 4-bit, generate a reply to a user prompt, and render it as Markdown.

  Args:
    messages: The user prompt text. Despite the name it is a single string; it is
      wrapped into chat messages by return_messages().

  Returns:
    None. The text is streamed to stdout during generation and the generated
    part is then displayed as Markdown. The model and tensors are released afterwards.
  """
  tokenizer = AutoTokenizer.from_pretrained(GEMMA2)
  tokenizer.pad_token = tokenizer.eos_token
  # The system turn is dropped before templating, so only the user message reaches the model.
  messages_for_template = [msg for msg in return_messages(messages) if msg['role'] != 'system']
  # add_generation_prompt=True appends the template's assistant-turn header, so the model
  # writes a reply instead of continuing from the end of the user's turn.
  inputs = tokenizer.apply_chat_template(
      messages_for_template,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to("cuda")
  streamer = TextStreamer(tokenizer)
  model = AutoModelForCausalLM.from_pretrained(GEMMA2, device_map="auto", quantization_config=quant_config)
  outputs = model.generate(inputs, max_new_tokens=1024, streamer=streamer)
  # Decode this call's own result (`outputs`), not the notebook-level `output` left over
  # from an earlier cell. Skip the echoed prompt ids and the chat-control tokens.
  generated_ids = outputs[0][inputs.shape[-1]:]
  display(Markdown(tokenizer.decode(generated_ids, skip_special_tokens=True)))
  # Drop every reference to the model and tensors before asking CUDA to release cached memory.
  del model, inputs, tokenizer, outputs, streamer, generated_ids
  gc.collect()
  torch.cuda.empty_cache()

In [None]:
# Run the load / generate / display / cleanup helper on the HR prompt.
generate(user_prompt)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos><start_of_turn>user
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.<end_of_turn>


| Full Name | Employee ID | Department | Role | Joining Date | Email Address | Salary |
|---|---|---|---|---|---|---|
| John Doe | EMP1234 | Marketing | Marketing Manager | 2021-01-15 | john.doe@example.com | $85,000 |
| Jane Smith | EMP5678 | Finance | Financial Analyst | 2022-05-20 | jane.smith@example.com | $75,000 |
| David Lee | EMP9012 | Sales | Sales Representative | 2020-09-10 | david.lee@example.com | $90,000 |
| Sarah Jones | EMP

<bos><start_of_turn>user
Generate 15 rows of synthetic test data in the **Human Resources** domain.

The output should be in **tabular format** using Markdown syntax. Each row should represent an employee record. Include the following columns:

- Full Name
- Employee ID
- Department
- Role
- Joining Date
- Email Address
- Salary (in USD)

Make the data look realistic and varied. Ensure that the Employee IDs follow a consistent pattern (e.g., EMP1234), and salaries range from $40,000 to $120,000.

Output only the table and nothing else.<end_of_turn>


| Full Name | Employee ID | Department | Role | Joining Date | Email Address | Salary |
|---|---|---|---|---|---|---|
| John Doe | EMP1234 | Marketing | Marketing Manager | 2021-01-15 | john.doe@example.com | $85,000 |
| Jane Smith | EMP5678 | Finance | Financial Analyst | 2022-05-20 | jane.smith@example.com | $75,000 |
| David Lee | EMP9012 | Sales | Sales Representative | 2020-09-10 | david.lee@example.com | $90,000 |
| Sarah Jones | EMP1113 | HR | HR Generalist | 2021-08-25 | sarah.jones@example.com | $60,000 |
| Michael Brown | EMP2223 | IT | Network Engineer | 2022-03-10 | michael.brown@example.com | $110,000 |
| Emily Wilson | EMP3334 | Marketing | Content Writer | 2020-07-15 | emily.wilson@example.com | $45,000 |
| Robert Garcia | EMP4445 | Sales | Sales Manager | 2021-02-01 | robert.garcia@example.com | $120,000 |
| Christopher Miller | EMP5556 | Finance | Accountant | 2022-09-15 | christopher.miller@example.com | $55,000 |
| Ashley Rodriguez | EMP6667 | HR | Recruiter | 2020-04-01 | ashley.rodriguez@example.com | $65,000 |
| Matthew Davis | EMP7778 | IT | Systems Administrator | 2021-06-10 | matthew.davis@example.com | $70,000 |
| Jessica Taylor | EMP8889 | Marketing | Marketing Specialist | 2022-01-01 | jessica.taylor@example.com | $50,000 |
| Daniel Thompson | EMP9990 | Sales | Sales Representative | 2020-03-15 | daniel.thompson@example.com | $80,000 |
| Amanda Nguyen | EMP1010 | Finance | Financial Analyst | 2021-07-10 | amanda.nguyen@example.com | $70,000 |
| Thomas Jackson | EMP1111 | IT | Software Developer | 2022-05-25 | thomas.jackson@example.com | $100,000 |
| Isabella Garcia | EMP1212 | HR | Training Specialist | 2020-09-01 | isabella.garcia@example.com | $60,000 | 
| Kevin Lee | EMP1313 | Sales | Sales Manager | 2021-08-01 | kevin.lee@example.com | $95,000 | 
| 
<end_of_turn>

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# model_id = "Qwen/Qwen1.5-0.5B-Chat"
# tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# # ans = pipe(user_prompt)
# # display(Markdown(ans[0]))