In [1]:
from dotenv import load_dotenv
import os
load_dotenv(verbose=False)

import google.generativeai as genai
import json

In [2]:
def load_data(fname):
    try:
        with open(fname, 'r') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = []
    return data 

def save_data(output_file, data):
    with open(output_file, 'w') as file:
        file.write(json.dumps(data, indent=4))

In [3]:
PROMPT = """Taking on the persona of data scientist. I am doing research to carry out information extraction model trained by data build from scratch. Now I need data from my model training.  I want to generate some real-world examples of information extraction related Medical Records Extraction. 
Specifically, an example should contain the following two items:
1. text: [paragraphs from unstructured clinical notes] 
2. extraction: [
    Patient ID: Unique identifier for the patient.
    Date of Visit: The date when the patient was seen by a healthcare provider.
    Allergy Type: Specific allergens (e.g., penicillin, peanuts).
    Medication Dosage: Dosage details for each prescribed medication.
    Lab Test Date: Dates when lab tests were conducted.
    Test Results: Specific results from lab tests (e.g., blood glucose level).
    Referring Physician: Name of the doctor who referred the patient.
    Follow-Up Date: Scheduled date for the next visit or follow-up.
    Patient Weight: Recorded weight of the patient during the visit.
    Insurance Information: Details about the patient's insurance provider.]
""" 

In [4]:
NOTE = "\nFollowing the format of the examples above, I would like you to help me generate 20 examples that meet the following requirements:\n \
1. These examples should be described in different styles.  \n \
2. The generated text do not overlap structure of paragraph, expression, methods of description, paraphrase words,etc. \n \
3. The generated text should be coherent and meaningful. \n\n \
Remember data must be returned by json format\n"

In [5]:
def generate_data(seeds: list, model="gemini-1.5-flash-002"):
    str_seed = ""
    for seed in seeds:
        str_seed += f'\ntext: "{seed["text"]}"\nextraction: {seed["extraction"]}\n'
        
    genai.configure(api_key=os.getenv("API_GEMINI_KEY"))
    model = genai.GenerativeModel("gemini-1.5-flash-002")
    response = model.generate_content(PROMPT + str_seed + NOTE)
    return json.loads(response.text.replace("```json\n", "").replace("\n```", ""))

In [6]:
SEEDS_FILE = 'Data/seeds.json'
MAX_SAMPLES_TRAIN = 70000
MAX_SAMPLES_TEST = 2000
MAX_SAMPLES_VAL = 6000
TRAINING_DATA_FILE = 'Data/train.json'
TEST_DATA_FILE = 'Data/test.json'
VAL_DATA_FILE = 'Data/val.json'

In [7]:
def build_data(SEEDS_FILE: str, MAX_SAMPLES: int, DATA_FILE: str):
    seeds = load_data(SEEDS_FILE)
    data  = load_data(DATA_FILE)
    while len(data) < MAX_SAMPLES:
        new_data = generate_data(seeds)
        data += new_data
        save_data(DATA_FILE, data)
        print(len(data), "samples generated")

In [None]:
build_data(SEEDS_FILE, MAX_SAMPLES_TRAIN, TRAINING_DATA_FILE)

51361 samples generated
51382 samples generated
51400 samples generated
51424 samples generated
51448 samples generated
51468 samples generated
51489 samples generated
51513 samples generated
51533 samples generated
51554 samples generated
51578 samples generated
51599 samples generated
51621 samples generated
51639 samples generated
51662 samples generated
51686 samples generated
51709 samples generated
51730 samples generated
51751 samples generated
51772 samples generated
51796 samples generated
51819 samples generated
51840 samples generated
51860 samples generated
51880 samples generated
51904 samples generated
51924 samples generated
51944 samples generated
51966 samples generated
51986 samples generated
52009 samples generated
52032 samples generated
52056 samples generated
52081 samples generated
52104 samples generated
52125 samples generated
52146 samples generated
52168 samples generated
52189 samples generated
52208 samples generated
52227 samples generated
52251 samples ge

In [29]:
build_data(SEEDS_FILE, MAX_SAMPLES_TEST, TEST_DATA_FILE)

855 samples generated
877 samples generated
899 samples generated
922 samples generated
942 samples generated
965 samples generated
984 samples generated
1002 samples generated
1020 samples generated
1043 samples generated
1065 samples generated
1087 samples generated
1108 samples generated
1132 samples generated
1153 samples generated
1174 samples generated
1195 samples generated
1219 samples generated
1243 samples generated
1265 samples generated
1289 samples generated
1310 samples generated
1331 samples generated
1353 samples generated
1376 samples generated
1397 samples generated
1415 samples generated
1439 samples generated
1461 samples generated
1480 samples generated
1501 samples generated
1525 samples generated
1548 samples generated
1569 samples generated
1588 samples generated
1612 samples generated
1634 samples generated
1658 samples generated
1678 samples generated
1698 samples generated
1718 samples generated
1742 samples generated
1763 samples generated
1783 samples gener

In [9]:
build_data(SEEDS_FILE, MAX_SAMPLES_VAL, VAL_DATA_FILE)

4030 samples generated
4052 samples generated
4071 samples generated
4095 samples generated
4115 samples generated
4134 samples generated
4154 samples generated
4174 samples generated
4198 samples generated
4217 samples generated
4241 samples generated
4264 samples generated
4288 samples generated
4311 samples generated
4332 samples generated
4353 samples generated
4373 samples generated
4395 samples generated
4416 samples generated
4439 samples generated
4463 samples generated
4484 samples generated
4502 samples generated
4523 samples generated
4541 samples generated
4562 samples generated
4585 samples generated
4608 samples generated
4630 samples generated
4650 samples generated
4669 samples generated
4687 samples generated
4707 samples generated
4727 samples generated
4747 samples generated
4764 samples generated
4786 samples generated
4810 samples generated
4833 samples generated
4857 samples generated
4878 samples generated
4898 samples generated
4917 samples generated
4940 sample

In [10]:
train = load_data(TRAINING_DATA_FILE)
test = load_data(TEST_DATA_FILE)
val = load_data(VAL_DATA_FILE)

len(train), len(test), len(val)

(51338, 2009, 6000)