# Quick exploration

This notebook is a quick exploration of the data. It is not meant to be exhaustive, but rather to get a quick overview of the data.
As such, it is not meant to be clean or well documented.

## DDXPLUS

In [11]:
import json
import pandas as pd
import numpy as np

import random
import re

In [12]:
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

In [13]:
# Assuming the data is loaded from the respective JSON and CSV files
evidences = load_json('DDxPLUS/release_evidences.json')
conditions = load_json('DDxPLUS/release_conditions.json')
patients = pd.read_csv('DDxPLUS/release_train_patients.csv')  # Example for the training set

In [14]:
patients

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",M,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",M,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",F,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto
3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",F,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx
4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",M,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux
...,...,...,...,...,...,...
1025597,18,"[['Épiglottite', 0.28156957795466475], ['VIH (...",M,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",fievre
1025598,28,"[['Épiglottite', 0.3703962237298842], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_vive', 'doule...",fievre
1025599,0,"[['Épiglottite', 0.13193905052537108], ['Laryn...",F,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",stridor
1025600,26,"[['Épiglottite', 0.3028258988138983], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",stridor


In [15]:
def generate_medical_prompts(patients, evidences, conditions):
    prompts = []
    for _, patient in patients.iterrows():
        
        print(patient['PATHOLOGY'])

        ground_truth_key = patient.PATHOLOGY # Ground truth pathology
        ground_truth_name = conditions[ground_truth_key]['cond-name-eng'] # Ground truth pathology name                       
        symptoms = list(conditions[ground_truth_key]["symptoms"].keys())

        print(ground_truth_name)

        # Randomly select an evidence for the instruction
        initial_evidence = random.choice(patient['EVIDENCES'].split(';'))
        evidence_details = evidences.get(initial_evidence, {})

        print(initial_evidence)
        print(evidence_details)

        # Create the instruction based on the evidence
        instruction = f"Based on the following evidence: {evidence_details.get('question_en', 'N/A')}, what could be the potential diagnosis?"

        # Context can include patient's demographic data and initial evidence
        context = f"Patient Age: {patient['AGE']}, Sex: {patient['SEX']}, Initial Evidence: {patient['INITIAL_EVIDENCE']}"

        # Response is the ground truth pathology
        response = patient['PATHOLOGY']

        # Unify into a single prompt
        unified_prompt = f"Instruction: {instruction}\nContext: {context}\nResponse: {response}"
        prompts.append(unified_prompt)
    return prompts

generate_medical_prompts(
    patients.sample(1),
    evidences, 
    conditions
)

VIH (Primo-infection)
HIV (initial infection)
['atcd_its', 'diarrhee', 'douleurxx', 'douleurxx_carac_@_pénible', 'douleurxx_carac_@_sensible', 'douleurxx_carac_@_épuisante', 'douleurxx_endroitducorps_@_arrière_de_tête', 'douleurxx_endroitducorps_@_dessus_de_tête', 'douleurxx_endroitducorps_@_front', 'douleurxx_endroitducorps_@_tempe_D_', 'douleurxx_endroitducorps_@_tempe_G_', 'douleurxx_intens_@_3', 'douleurxx_irrad_@_nulle_part', 'douleurxx_precis_@_3', 'douleurxx_soudain_@_0', 'drogues_IV', 'fievre', 'gorge_dlr', 'itss_risque', 'lesions_peau', 'lesions_peau_couleur_@_pale', 'lesions_peau_desquame_@_N', 'lesions_peau_elevee_@_0', 'lesions_peau_endroitducorps_@_gencive_inférieure', 'lesions_peau_endroitducorps_@_grande_lèvre_D_', 'lesions_peau_endroitducorps_@_joue_interne_G_', 'lesions_peau_endroitducorps_@_lèvre_inferieure_D_', 'lesions_peau_endroitducorps_@_palais', 'lesions_peau_intens_@_2', 'lesions_peau_plusqu1cm_@_O', 'lesions_peau_prurit_@_0', 'msk_dlr', 'nausee', 'perte_poids'

['Instruction: Based on the following evidence: N/A, what could be the potential diagnosis?\nContext: Patient Age: 23, Sex: M, Initial Evidence: gorge_dlr\nResponse: VIH (Primo-infection)']

## Medical dialogue dataset

https://huggingface.co/datasets/medical_dialog

In [16]:
from datasets import load_dataset

In [17]:
dialogues = load_dataset("medical_dialog", "processed.en") #, "en", data_dir="DIALOGUES")

In [18]:
dialogues

DatasetDict({
    train: Dataset({
        features: ['description', 'utterances'],
        num_rows: 482
    })
    validation: Dataset({
        features: ['description', 'utterances'],
        num_rows: 60
    })
    test: Dataset({
        features: ['description', 'utterances'],
        num_rows: 61
    })
})

In [19]:
dialogues['train']

Dataset({
    features: ['description', 'utterances'],
    num_rows: 482
})

In [28]:
dialogues['train'][0].keys()

dict_keys(['description', 'utterances'])

In [35]:
dialogues['train'][0]["description"]

'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.'

In [73]:
diag_list = []
for record in dialogues['train']:
    utt = record["utterances"]
    diag_list.append({
        "patient": utt[0].replace("patient: ", ""),
        "doctor": utt[1].replace("doctor: ", "")
    })

dialogue_df = pd.DataFrame(diag_list)

In [74]:
dialogue_df

Unnamed: 0,patient,doctor
0,throat a bit sore and want to get a good imune...,during this pandemic. throat pain can be from ...
1,"hey there i have had cold ""symptoms"" for over ...",yes. protection. it is not enough symptoms to ...
2,i have a tight and painful chest with a dry co...,"possible. top symptoms include fever, dry coug..."
3,what will happen after the incubation period f...,"in brief: symptoms if you are infected, sympto..."
4,just found out i was pregnant. yesterday diagn...,thanks for your question on healthcare magic.i...
...,...,...
477,my 5 year old son woke up not feeling well. i ...,"in brief: arrange testing stay home, provide f..."
478,i have a dry cough and sore throat- it's been ...,in brief: covid good guidelines can be found a...
479,how do i know if i have a normal cold or maybe...,common cold with sin. the corona virus causes ...
480,hi- i was diagnosed a month ago with community...,hello! just because you have previously had a ...


In [61]:
dialogue_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,patient: throat a bit sore and want to get a g...,doctor: during this pandemic. throat pain can ...,,,,,,,,,,,,,,,
1,"patient: hey there i have had cold ""symptoms"" ...",doctor: yes. protection. it is not enough symp...,,,,,,,,,,,,,,,
2,patient: i have a tight and painful chest with...,"doctor: possible. top symptoms include fever, ...",,,,,,,,,,,,,,,
3,patient: what will happen after the incubation...,doctor: in brief: symptoms if you are infected...,,,,,,,,,,,,,,,
4,patient: just found out i was pregnant. yester...,doctor: thanks for your question on healthcare...,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,patient: my 5 year old son woke up not feeling...,"doctor: in brief: arrange testing stay home, p...",,,,,,,,,,,,,,,
478,patient: i have a dry cough and sore throat- i...,doctor: in brief: covid good guidelines can be...,,,,,,,,,,,,,,,
479,patient: how do i know if i have a normal cold...,doctor: common cold with sin. the corona virus...,,,,,,,,,,,,,,,
480,patient: hi- i was diagnosed a month ago with ...,doctor: hello! just because you have previousl...,,,,,,,,,,,,,,,


## Diagnose-ME

https://www.kaggle.com/datasets/dsxavier/diagnoise-me

In [21]:
data = pd.read_json("diagnose-me/en_medical_dialog.json")

In [22]:
data

Unnamed: 0,id,Description,Doctor,Patient
0,0,Q. What does abutment of the nerve root mean?,Hi. I have gone through your query with dilige...,"Hi doctor,I am just wondering what is abutting..."
1,1,"Q. Every time I eat spicy food, I poop blood. ...",Hello. I have gone through your information an...,"Hi doctor, I am a 26 year old male. I am 5 fee..."
2,2,Q. Will Nano-Leo give permanent solution for e...,Hi. For further doubts consult a sexologist on...,"Hello doctor, I am 48 years old. I am experien..."
3,3,Q. Will Kalarchikai cure multiple ovarian cyst...,Hello. I just read your query. See Kalarachi K...,"Hello doctor, I have multiple small cysts in b..."
4,4,Q. I masturbate only by rubbing the tip of the...,Hi. For further doubts consult a sexologist on...,"Hi doctor, During masturbation I just rub the ..."
...,...,...,...,...
257464,257464,"Unprotected sex after periods, took morning af...",Hormonal method of birth control like pills an...,"Hello, I am , age 26 years old. On 7th of may,..."
257465,257465,"Delivered baby, plan for second child after 4-...",Do you know how this pills act and how your me...,okay so i got this loette pill right its a rea...
257466,257466,"Taking loette pill, have started half way thro...",Hi thanks for your question your taking contra...,taking the mini pill Cerazette and missed taki...
257467,257467,"On Cerazette, missed pills twice at night, fol...",Hi Cassctiexx Thanks for writing in to Healthc...,Hi I recently received the depo-provera shot o...


In [23]:
data["Doctor"] = data["Doctor"].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()

In [24]:
data["Patient"] = data["Patient"].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()

In [25]:
data["Description"] = data["Description"].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()

In [26]:
data.drop(columns=["id"], inplace=True)
data.columns = ["desc", "doctor", "patient"]

In [27]:
data

Unnamed: 0,desc,doctor,patient
0,Q. What does abutment of the nerve root mean?,Hi. I have gone through your query with dilige...,"Hi doctor,I am just wondering what is abutting..."
1,"Q. Every time I eat spicy food, I poop blood. ...",Hello. I have gone through your information an...,"Hi doctor, I am a 26 year old male. I am 5 fee..."
2,Q. Will Nano-Leo give permanent solution for e...,Hi. For further doubts consult a sexologist on...,"Hello doctor, I am 48 years old. I am experien..."
3,Q. Will Kalarchikai cure multiple ovarian cyst...,Hello. I just read your query. See Kalarachi K...,"Hello doctor, I have multiple small cysts in b..."
4,Q. I masturbate only by rubbing the tip of the...,Hi. For further doubts consult a sexologist on...,"Hi doctor, During masturbation I just rub the ..."
...,...,...,...
257464,"Unprotected sex after periods, took morning af...",Hormonal method of birth control like pills an...,"Hello, I am , age 26 years old. On 7th of may,..."
257465,"Delivered baby, plan for second child after 4-...",Do you know how this pills act and how your me...,okay so i got this loette pill right its a rea...
257466,"Taking loette pill, have started half way thro...",Hi thanks for your question your taking contra...,taking the mini pill Cerazette and missed taki...
257467,"On Cerazette, missed pills twice at night, fol...",Hi Cassctiexx Thanks for writing in to Healthc...,Hi I recently received the depo-provera shot o...
