### Imports

In [1]:
import concurrent.futures
import json
import logging
import os
import time

import openai
import requests
from requests.exceptions import Timeout

### Logger setup

In [3]:
# Application logger that prints timestamped INFO-and-above messages to the
# console (StreamHandler writes to stderr by default).
app_logger = logging.getLogger('my_app_logger')
app_logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# logging.getLogger returns the same logger object for the same name, so
# executing this code again (e.g. re-running the notebook cell) would stack
# another handler on it and print every message once per execution. Attach the
# handler only when the logger does not have one yet.
if not app_logger.handlers:
    app_logger.addHandler(ch)

### ICD code labels extraction tools

In [None]:
# Path to the JSON Lines file holding the ICD code labels.
# NOTE(review): nothing else in the visible code references this variable; the
# configuration section passes the same path to get_icd_code_labels_dict as a
# string literal - confirm whether the two are meant to be kept in sync.
icd_labels_file = "icd_code_labels.jsonl"

In [5]:
def get_icd_code_labels_dict(jsonl_path):
    """Load a mapping of ICD code to label from a JSON Lines file.

    Every non-blank line of the file must be a JSON object with a "code" and
    a "label" key. Blank or whitespace-only lines are skipped, so stray empty
    lines in the file do not abort loading. If a code occurs more than once,
    the label from the last occurrence is kept.

    Args:
        jsonl_path: Path of the JSON Lines file; it is read as UTF-8.

    Returns:
        A dict mapping each record's "code" value to its "label" value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the "code" or "label" key.
    """
    code_labels = {}

    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # json.loads("") raises, so tolerate empty lines explicitly.
                continue
            record = json.loads(line)
            code_labels[record['code']] = record['label']

    return code_labels

### Synthetic data generation tools

In [7]:
def generate_prompt(icd_code, label, max_words):
    """Build the instruction sent to the model for one diagnosis.

    Args:
        icd_code: ICD code of the diagnosis, interpolated into the prompt.
        label: Human-readable diagnosis label, shown in parentheses after the code.
        max_words: Word limit mentioned in the prompt.

    Returns:
        The prompt text: five instruction sentences, each on its own line and
        preceded by a newline plus four spaces, with the same separator
        trailing the last sentence.
    """
    # Every sentence is preceded by this separator, and it also closes the prompt.
    separator = "\n    "
    sentences = (
        f"Think up a text in Russian that describes patient's symptoms for the diagnosis: {icd_code} ({label}).",
        f"The text should be concise, not exceeding {max_words} words.",
        "The text should be creative and not stereotype, the text must not contain the ICD code or the diagnosis name.",
        "The text should be realistic and diverse so that it could actually appear in person's EHR.",
        "Your response should be only the symptoms text, written in Russian.",
    )
    return separator + separator.join(sentences) + separator

In [18]:
def generate_synthetic_symptoms(
    run_idx,
    diagnosis_code,
    diagnosis_label,
    model_version,
    max_words,
    count,
    temperature,
    timeout_seconds,
    max_retries,
    max_tokens=512,
    backoff_seconds=1.0,
):
    """Generate synthetic symptom texts for one diagnosis via the OpenAI chat API.

    The same prompt (built by ``generate_prompt``) is sent ``count`` times.
    Each sample gets up to ``max_retries`` attempts; a sample whose attempts
    all fail is logged and skipped, so the result may hold fewer than
    ``count`` entries. Request failures are logged to ``app_logger`` rather
    than raised. The API key is read from the module-level ``api_key``.

    Args:
        run_idx: Generation-session index embedded in every entry's "idx".
        diagnosis_code: ICD code of the diagnosis.
        diagnosis_label: Label of the diagnosis, used in the prompt.
        model_version: Model name sent in the request.
        max_words: Word limit mentioned in the prompt.
        count: Number of samples to request.
        temperature: Sampling temperature sent in the request.
        timeout_seconds: Timeout passed to each HTTP request.
        max_retries: Maximum number of attempts per sample.
        max_tokens: Completion token limit sent in the request.
        backoff_seconds: Delay before the second attempt; it doubles after
            every further failure. A value of 0 or less disables the delay.

    Returns:
        A list of dicts with the keys "idx"
        (``synthetic_<diagnosis_code>_<run_idx>_<sample index>``), "symptoms"
        (the generated text) and "code" (``diagnosis_code``).
    """
    synthetic_symptoms = []

    prompt = generate_prompt(diagnosis_code, diagnosis_label, max_words)

    # The request body and headers are the same for every sample and attempt,
    # so they are built once instead of on every request.
    payload = {
        "model": model_version,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    for i in range(count):
        retries = 0
        while retries < max_retries:
            try:
                response = requests.post(
                    "https://api.openai.com/v1/chat/completions",
                    json=payload,
                    headers=headers,
                    timeout=timeout_seconds
                )

                if response.status_code == 200:
                    generated_text = response.json()['choices'][0]['message']['content']
                    synthetic_symptoms.append({
                        "idx": f"synthetic_{diagnosis_code}_{run_idx}_{i}",
                        "symptoms": generated_text,
                        "code": diagnosis_code,
                    })
                    app_logger.info(f"Completed {diagnosis_code}, {i}")
                    break

                app_logger.error(f"Failed to generate symptoms for {diagnosis_code}, {i}: {response.text}")
            except Timeout:
                app_logger.error(f"Request timed out for {diagnosis_code}, {i}")
            except Exception as e:
                # Deliberately broad: one bad response must not abort the whole
                # batch. The message names the sample so the failure is traceable.
                app_logger.error(f"An error occurred for {diagnosis_code}, {i}: {e}")

            # Reached only when the attempt failed (success leaves via break).
            retries += 1
            if retries >= max_retries:
                app_logger.error(f"Max retries reached for {diagnosis_code}, {i}")
                break

            # Exponential backoff: retrying immediately tends to hit the same
            # rate limit or outage again and burn all remaining attempts.
            if backoff_seconds > 0:
                time.sleep(backoff_seconds * 2 ** (retries - 1))

    return synthetic_symptoms

### Configuration

In [None]:
# OpenAI access key. It is taken from the OPENAI_API_KEY environment variable so
# that the secret does not have to be stored in (and committed with) this file;
# the fallback is the same empty string as before.
api_key = os.environ.get("OPENAI_API_KEY", "")
run_idx = 0  # global index to differentiate generation sessions
max_words = 120  # max words for the synthetic symptoms
count_per_icd = 25  # how many synthetic entries to generate per diagnosis
# Sampling temperature sent to the API (the OpenAI API accepts values from 0 to 2);
# higher values make the synthetic samples more random and creative.
temperature = 0.8
model_version = "gpt-4"  # model version, valid models can be found on the OpenAI website

max_workers = 4  # number of workers for the thread pool, can be adjusted depending on OpenAI rate limits and available hardware
timeout_seconds = 25  # per-request timeout before a generation attempt is retried
max_retries = 5  # max attempts before skipping the generation of a synthetic symptom

# Mapping of ICD code -> label; one generation task is submitted per entry.
code_labels = get_icd_code_labels_dict("icd_code_labels.jsonl")
# NOTE(review): the "gpt_4" prefix is fixed and does not follow model_version.
output_name = f"gpt_4_RuMedTop3_{run_idx}.jsonl"

### Execution

In [None]:
# Fan out one generation task per ICD code on a thread pool and gather the
# per-diagnosis result lists in the order in which the tasks finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for code, label in code_labels.items():
        task = executor.submit(
            generate_synthetic_symptoms,
            run_idx,
            code,
            label,
            model_version,
            max_words,
            count_per_icd,
            temperature,
            timeout_seconds,
            max_retries,
        )
        futures.append(task)

    results = []
    for future in concurrent.futures.as_completed(futures):
        # result() re-raises any exception that escaped the worker.
        results.append(future.result())

# Flatten the list of per-diagnosis lists into a single list of entries.
all_synthetic_symptoms = []
for sublist in results:
    all_synthetic_symptoms.extend(sublist)

### Saving the results to a jsonl file (RuMedTop3 format)

In [None]:
# Write the entries as JSON Lines: one JSON object per line, with non-ASCII
# characters (the Russian text) stored as-is rather than escaped.
with open(output_name, 'w', encoding="utf-8") as outfile:
    for entry in all_synthetic_symptoms:
        serialized = json.dumps(entry, ensure_ascii=False)
        outfile.write(serialized + '\n')