# Generating the Dataset for Pipeline #1: Model Classification


In [41]:
import csv
import random
from tqdm import tqdm

In [42]:
# Prompt templates, grouped by the classification label they are generated under.
# Placeholders in braces are substituted with str.format() by generate_prompts().

# Label 'general': the user states or asks whether they have a symptom/disease.
general_prompt_template = [
    "I have {SYMPTOM_OR_DISEASE}",
    "I feel {SYMPTOM_OR_DISEASE}",
    "Do I have {SYMPTOM_OR_DISEASE}?",
    "Is it possible that I have {SYMPTOM_OR_DISEASE}?"
]

# Label 'desc': the user asks for a definition/description of a symptom or disease.
desc_prompt_template = [
    "Can you tell me what is {SYMPTOM_OR_DISEASE}?",
    "What is {SYMPTOM_OR_DISEASE}?",
    "What is the meaning of {SYMPTOM_OR_DISEASE}?",
    "What is the definition of {SYMPTOM_OR_DISEASE}?",
    "What is the description of {SYMPTOM_OR_DISEASE}?",
    "Can you describe {SYMPTOM_OR_DISEASE}?"
]

# Label 'symptom': the user lists symptoms and asks which disease they indicate.
# {SYMPTOMS} receives a comma-separated list of one to three symptom names.
symptom_prompt_template = [
    "I have {SYMPTOMS}, what is the name of this disease?",
    "I feel {SYMPTOMS}, what disease is this?"
]

# Label 'precaution': the user asks how to cure/prevent a named disease.
precaution_prompt_template = [
    "How to cure {DISEASE}?",
    "How to prevent {DISEASE}?",
    "What are the precautions for {DISEASE}?"
]

# Label 'dont answer': off-topic prompts; these contain no placeholders.
dont_answer_prompt_template = [
    "Tell me a joke.",
    "What's the weather today?",
    "How do I cook pasta?",
    "Can you recommend a good movie?",
    "What's the capital of France?",
    "What time is it in Tokyo?",
    "How to solve a Rubik's cube?",
    "Who won the football match yesterday?",
    "What's your favorite color?",
    "Do you like music?",
    "How old are you?",
    "What is the meaning of life?",
    "Can you help me with my homework?",
    "What's new in technology?",
    "Tell me about space exploration."
]

In [43]:
# load
def load_symptoms(filepath, max_symptoms=17):
    """Read the disease/symptom CSV and collect the names it contains.

    The file must have a ``Disease`` column. Symptoms are read from the
    ``Symptom_1`` .. ``Symptom_<max_symptoms>`` columns; cells are stripped of
    surrounding whitespace, and blank or missing cells/columns are ignored.

    Args:
        filepath: Path of the CSV file to read.
        max_symptoms: Highest ``Symptom_<i>`` column index to look at.

    Returns:
        A ``(diseases, symptoms)`` tuple. ``diseases`` has one stripped entry
        per data row (so names may repeat). ``symptoms`` holds every distinct
        symptom once, in order of first appearance in the file.

    Raises:
        KeyError: If a data row is read from a file without a ``Disease`` column.
    """
    symptom_columns = [f'Symptom_{i}' for i in range(1, max_symptoms + 1)]
    diseases = []
    # A dict is used as an ordered set: unlike list(set(...)), the resulting
    # order does not depend on string hashing and is stable between runs.
    symptoms = {}
    # newline='' is what the csv module expects; the encoding matches save_dataset.
    with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            diseases.append((row['Disease'] or '').strip())
            for column in symptom_columns:
                # DictReader yields None for cells missing from short rows and
                # row.get() covers files with fewer symptom columns.
                symptom = (row.get(column) or '').strip()
                if symptom:
                    symptoms[symptom] = None
    return diseases, list(symptoms)

def load_precautions(filepath, max_precautions=4):
    """Read the disease/precaution CSV into a dictionary.

    The file must have a ``Disease`` column. Precautions are read from the
    ``Precaution_1`` .. ``Precaution_<max_precautions>`` columns; cells are
    stripped of surrounding whitespace, and blank or missing cells/columns
    are ignored.

    Args:
        filepath: Path of the CSV file to read.
        max_precautions: Highest ``Precaution_<i>`` column index to look at.

    Returns:
        A dict mapping each stripped disease name to its list of precautions
        in column order. If a disease appears on several rows, the last row wins.

    Raises:
        KeyError: If a data row is read from a file without a ``Disease`` column.
    """
    precaution_columns = [f'Precaution_{i}' for i in range(1, max_precautions + 1)]
    precautions = {}
    # newline='' is what the csv module expects; the encoding matches save_dataset.
    with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            disease = (row['Disease'] or '').strip()
            # DictReader yields None for cells missing from short rows and
            # row.get() covers files with fewer precaution columns.
            cells = ((row.get(column) or '').strip() for column in precaution_columns)
            precautions[disease] = [cell for cell in cells if cell]
    return precautions

In [44]:
# save
def save_dataset(dataset, filepath, fieldnames=None):
    """Write a list of row dictionaries to a UTF-8 CSV file with a header row.

    Args:
        dataset: List of dicts, one per CSV row.
        filepath: Destination path; an existing file is overwritten.
        fieldnames: Optional column names/order. When omitted, the keys of the
            first row are used.

    An empty ``dataset`` with no ``fieldnames`` produces an empty file (there
    is no row to take the column names from). With ``fieldnames`` given, an
    empty ``dataset`` produces a header-only file.

    Raises:
        ValueError: From ``csv.DictWriter`` if a row has a key that is not
            among the column names.
    """
    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        if fieldnames is None:
            if not dataset:
                # No rows and no declared columns: leave the file empty
                # instead of failing on dataset[0].
                return
            fieldnames = list(dataset[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(dataset)

In [45]:
def generate_prompts(templates, variables_list, label):
    """Render every template against every variable mapping.

    Args:
        templates: Iterable of ``str.format`` style template strings.
        variables_list: Iterable of dicts supplying the placeholder values.
        label: Label stored with each prompt; also used as the title of the
            outer progress bar.

    Returns:
        A list of ``{'Prompt': ..., 'Label': label}`` dicts, ordered by
        variable mapping first and template second.
    """
    rows = []
    for fields in tqdm(variables_list, desc=label):
        # The inner bar is transient (leave=False) so only one line per label remains.
        rows.extend(
            {'Prompt': pattern.format(**fields), 'Label': label}
            for pattern in tqdm(templates, desc='Template', leave=False)
        )
    return rows

def generate_dataset(n=50, seed=None, symptoms_path='data/symptoms.csv'):
    """Build the labelled prompt dataset for the prompt classifier.

    For the 'general', 'desc', 'symptom' and 'precaution' labels, ``n``
    variable sets are sampled and each one is rendered with every template of
    that label (``n * len(templates)`` rows). For 'dont answer', whose
    templates have no placeholders, ``n`` templates are sampled with
    replacement (``n`` rows).

    Args:
        n: Target number of samples per label.
        seed: When given, a private ``random.Random(seed)`` is used so the
            result is reproducible. When ``None``, the global ``random``
            module state is used.
        symptoms_path: CSV file passed to ``load_symptoms``.

    Returns:
        A tuple ``(dataset, general_dataset, desc_dataset, symptom_dataset,
        precaution_dataset, dont_answer_dataset)``. ``dataset`` is the shuffled
        concatenation of the five per-label lists; every row is a
        ``{'Prompt': ..., 'Label': ...}`` dict.

    Raises:
        ValueError: If the symptoms file yields no disease or no symptom names.
    """
    diseases, symptoms = load_symptoms(symptoms_path)

    # Sample from sorted unique names: a disease listed on many CSV rows must
    # not be drawn proportionally more often than a symptom, and a fixed order
    # is needed for ``seed`` to reproduce the same dataset.
    diseases = sorted({name for name in diseases if name})
    symptoms = sorted({name for name in symptoms if name})
    if not diseases or not symptoms:
        raise ValueError(
            f'{symptoms_path!r} must provide at least one disease and one symptom'
        )

    rng = random if seed is None else random.Random(seed)
    all_symptoms_and_diseases = symptoms + diseases

    # General prompts
    general_vars = [{'SYMPTOM_OR_DISEASE': item}
                    for item in rng.choices(all_symptoms_and_diseases, k=n)]
    general_dataset = generate_prompts(general_prompt_template, general_vars, 'general')

    # Description prompts
    desc_vars = [{'SYMPTOM_OR_DISEASE': item}
                 for item in rng.choices(all_symptoms_and_diseases, k=n)]
    desc_dataset = generate_prompts(desc_prompt_template, desc_vars, 'desc')

    # Symptom prompts: one to three distinct symptoms, capped by how many exist
    # so that sampling without replacement cannot fail on a small file.
    max_per_prompt = min(3, len(symptoms))
    symptom_vars = []
    for _ in range(n):
        num_symptoms = rng.randint(1, max_per_prompt)
        selected_symptoms = rng.sample(symptoms, num_symptoms)
        symptom_vars.append({'SYMPTOMS': ', '.join(selected_symptoms)})
    symptom_dataset = generate_prompts(symptom_prompt_template, symptom_vars, 'symptom')

    # Precaution prompts
    precaution_vars = [{'DISEASE': disease} for disease in rng.choices(diseases, k=n)]
    precaution_dataset = generate_prompts(precaution_prompt_template, precaution_vars, 'precaution')

    # Don't-answer prompts: no placeholders to vary, so the templates
    # themselves are sampled to reach the per-label target of n rows.
    extended_dont_answer_templates = rng.choices(dont_answer_prompt_template, k=n)
    dont_answer_dataset = generate_prompts(extended_dont_answer_templates, [{}], 'dont answer')

    dataset = (general_dataset + desc_dataset + symptom_dataset
               + precaution_dataset + dont_answer_dataset)
    rng.shuffle(dataset)

    return dataset, general_dataset, desc_dataset, symptom_dataset, precaution_dataset, dont_answer_dataset

In [46]:
# Build the combined dataset plus one split per label, then write each to its own CSV.
(dataset, general_dataset, desc_dataset,
 symptom_dataset, precaution_dataset, dont_answer_dataset) = generate_dataset()

for _rows, _csv_path in [
    (dataset, 'data/generated_all_dataset.csv'),
    (general_dataset, 'data/generated_general_dataset.csv'),
    (desc_dataset, 'data/generated_desc_dataset.csv'),
    (symptom_dataset, 'data/generated_symptom_dataset.csv'),
    (precaution_dataset, 'data/generated_precaution_dataset.csv'),
    (dont_answer_dataset, 'data/generated_dont_answer_dataset.csv'),
]:
    save_dataset(_rows, _csv_path)

general: 100%|██████████| 50/50 [00:00<00:00, 322.61it/s]
desc: 100%|██████████| 50/50 [00:00<00:00, 402.67it/s]
symptom: 100%|██████████| 50/50 [00:00<00:00, 380.92it/s]
precaution: 100%|██████████| 50/50 [00:00<00:00, 378.01it/s]
dont answer: 100%|██████████| 1/1 [00:00<00:00, 249.48it/s]
