In [1]:
# import necessary libraries
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the dataset.
# Every file lives in the same directory. The paths are built with pathlib so the
# notebook runs on Windows, Linux and macOS alike (hard-coded '\\' separators only
# resolve on Windows).
DATA_DIR = Path('..') / 'Datasets' / 'Patient-disease-symptom'

# JSON files: metadata tables (one column per evidence / per condition)
release_evidences_df = pd.read_json(DATA_DIR / 'release_evidences.json')
release_conditions_df = pd.read_json(DATA_DIR / 'release_conditions.json')
# CSV files (the files we need to train the models)
training_df = pd.read_csv(DATA_DIR / 'release_train_patients.csv')
testing_df = pd.read_csv(DATA_DIR / 'release_test_patients.csv')
validate_df = pd.read_csv(DATA_DIR / 'release_validate_patients.csv')

In [3]:
# Report the (rows, columns) shape of every dataframe loaded so far
loaded_frames = (
    ('release_evidences_df', release_evidences_df),
    ('release_conditions_df', release_conditions_df),
    ('training_df', training_df),
    ('testing_df', testing_df),
    ('validate_df', validate_df),
)
for frame_name, frame in loaded_frames:
    print(frame_name + ' shape: ', frame.shape)

release_evidences_df shape:  (9, 223)
release_conditions_df shape:  (7, 49)
training_df shape:  (1025602, 6)
testing_df shape:  (134529, 6)
validate_df shape:  (132448, 6)


In [4]:
# Display the evidence metadata: one column per evidence and 9 attribute rows
# (name, question text, default value, possible-values, data_type, ...),
# so head(9) shows every row of the frame.
release_evidences_df.head(9)

Unnamed: 0,fievre,douleurxx_endroitducorps,douleurxx,douleurxx_irrad,douleurxx_carac,douleurxx_soudain,douleurxx_intens,douleurxx_precis,pdc,lesions_peau_endroitducorps,...,tmine,tconst,j34.2,j33,momasthma,suburb,rural,z80.1,z80.0,J81
name,fievre,douleurxx_endroitducorps,douleurxx,douleurxx_irrad,douleurxx_carac,douleurxx_soudain,douleurxx_intens,douleurxx_precis,pdc,lesions_peau_endroitducorps,...,tmine,tconst,j34.2,j33,momasthma,suburb,rural,z80.1,z80.0,J81
code_question,fievre,douleurxx,douleurxx,douleurxx,douleurxx,douleurxx,douleurxx,douleurxx,pdc,lesions_peau,...,tmine,tconst,j34.2,j33,momasthma,suburb,rural,z80.1,z80.0,J81
question_fr,Avez-vous objectivé ou ressenti de la fièvre?,Avez-vous de la douleur quelque part?,Avez-vous de la douleur à quelque part en lien...,Est-ce que la douleur se propage vers un autre...,Caractérisez votre douleur:,À quelle vitesse la douleur est-elle apparue ?,Quelle est l’intensité de la douleur?,À quel point la douleur est-elle précisément l...,Avez-vous perdu conscience?,À quel endroit est situé chaque lésion?,...,Travaillez-vous dans le domaine minier?,Travaillez-vous dans le domaine de la construc...,Êtes-vous connu pour avoir une déviation du se...,Avez-vous des polypes dans le nez?,Avez-vous une mère qui souffre d’asthme?,Habitez-vous en banlieue?,Habitez-vous en à la campagne?,Avez-vous des membres de votre famille ayant e...,Avez-vous des membre de votre famille proche q...,Avez-vous déjà fait de l’eau sur les poumons?
question_en,Do you have a fever (either felt or measured w...,Do you feel pain somewhere?,"Do you have pain somewhere, related to your re...",Does the pain radiate to another location?,Characterize your pain:,How fast did the pain appear?,How intense is the pain?,How precisely is the pain located?,Did you lose consciousness?,Where is the affected region located?,...,Do you work in the mining sector?,Do you work in construction?,Do you have a deviated nasal septum?,Do you have polyps in your nose?,Does your mother suffer from asthma?,Do you live in the suburbs?,Do you live in a rural area?,Do you have family members who have had lung c...,Are there members of your family who have been...,Have you ever had fluid in your lungs?
is_antecedent,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
default_value,0,nulle_part,0,nulle_part,,0,0,0,0,nulle_part,...,0,0,0,0,0,0,0,0,0,0
value_meaning,{},"{'nulle_part': {'fr': 'nulle part', 'en': 'now...",{},"{'nulle_part': {'fr': 'nulle part', 'en': 'now...","{'NA': {'fr': 'NA', 'en': 'NA'}, 'déchirante':...",{},{},{},{},"{'nulle_part': {'fr': 'nulle part', 'en': 'now...",...,{},{},{},{},{},{},{},{},{},{}
possible-values,[],"[nulle_part, aile_iliaque_D_, aile_iliaque_G_,...",[],"[nulle_part, aile_iliaque_D_, aile_iliaque_G_,...","[NA, déchirante, lancinante_/_choc_électrique,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",[],"[nulle_part, aile_iliaque_D_, aile_iliaque_G_,...",...,[],[],[],[],[],[],[],[],[],[]
data_type,B,M,B,M,M,C,C,C,B,M,...,B,B,B,B,B,B,B,B,B,B


In [5]:
# Display the condition metadata: one column per condition and 7 attribute rows
# (names, ICD-10 id, symptoms, antecedents, severity), so head(7) shows every row of the frame.
release_conditions_df.head(7)

Unnamed: 0,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
condition_name,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
cond-name-fr,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
cond-name-eng,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
icd10-id,J93,g44.009,K22.3,S22.9,K21,B20,D64.9,J02.9,K40,G70.0,...,"j17, j18",j01,j32,j21,c34,I21,d86,c25,J81.0,I30
symptoms,"{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'pyrosis': {}, 'toux': {}, 'ww_bouffe': {}, '...","{'fievre': {}, 'fatig_ext': {}, 'msk_dlr': {},...","{'etourdissement': {}, 'fatig_mod': {}, 'fatig...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'dysphagie': {}, 'dysarthrie': {}, 'diplopie'...",...,"{'ww_respi': {}, 'douleurxx_endroitducorps': {...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'toux': {}, 'dyspn': {}, 'boire_ped': {}, 'rh...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'perte_appet': {}, 'douleurxx_endroitducorps'...","{'lesions_peau_endroitducorps': {}, 'lesions_p...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'ww_effort': {}, 'douleurxx_endroitducorps': ...","{'ww_respi': {}, 'ww_dd': {}, 'douleurxx_endro..."
antecedents,"{'f17.210': {}, 'pneumothorax': {}, 'ap_pneumo...","{'atcd_cluster': {}, 'f10.129': {}, 'rx_vasodi...","{'f10.129': {}, 'trav1': {}}","{'f10.129': {}, 'cancer_méta': {}, 'osteoporos...","{'e66': {}, 'preg1': {}, 'j45': {}, 'f10.129':...","{'itss_risque': {}, 'drogues_IV': {}, 'atcd_it...","{'Mauv_aliment': {}, 'atcd_anem': {}, 'atcd_fa...","{'dayc': {}, 'crowd': {}, 'f17.210': {}, 'cont...","{'perinatality': {}, 'e66': {}, 'trav1': {}}","{'atcdfam_mg': {}, 'trav1': {}}",...,"{'vaccination': {}, 'surg1': {}, 'j44_j42': {}...","{'vaccination': {}, 'j34.2': {}, 'f17.210': {}...","{'j06.9': {}, 'j34.2': {}, 'f17.210': {}, 'k21...","{'vaccination': {}, 'momasthma': {}, 'crowd': ...","{'smokingpast': {}, 'f17.210': {}, 'z80.1': {}...","{'i25.1': {}, 'i73.9': {}, 'smokingpast': {}, ...","{'e66': {}, 'tagri': {}, 'trav1': {}}","{'e10_e11': {}, 'f17.210': {}, 'e66': {}, 'z80...","{'i25.1': {}, 'i50': {}, 'J81': {}, 'i10': {},...","{'B34.9': {}, 'I30': {}, 'trav1': {}}"
severity,2,3,2,3,3,3,4,4,3,3,...,3,4,5,3,3,1,4,3,1,4


In [6]:
# Preview the first rows of the training set
# (columns: AGE, DIFFERENTIAL_DIAGNOSIS, SEX, PATHOLOGY, EVIDENCES, INITIAL_EVIDENCE)
training_df.head()

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",M,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",M,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",F,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto
3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",F,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx
4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",M,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux


In [7]:
# Find the largest number of possible values that any single evidence can take.
# The 'possible-values' row holds one list per evidence; entries that are None are skipped.
possible_value_lists = release_evidences_df.loc['possible-values']
max_len = max(
    (len(values) for values in possible_value_lists if values is not None),
    default=0,
)
print('max_len: ', max_len)

max_len:  165


In [8]:
# Inspect the raw EVIDENCES value of the first training row. It is stored as the string
# representation of a Python list, so it has to be parsed before it can be used as a list.
training_df['EVIDENCES'][0]

"['crowd', 'diaph', 'douleurxx', 'douleurxx_carac_@_sensible', 'douleurxx_carac_@_une_lourdeur_ou_serrement', 'douleurxx_endroitducorps_@_front', 'douleurxx_endroitducorps_@_joue_D_', 'douleurxx_endroitducorps_@_tempe_G_', 'douleurxx_intens_@_4', 'douleurxx_irrad_@_nulle_part', 'douleurxx_precis_@_3', 'douleurxx_soudain_@_3', 'expecto', 'f17.210', 'fievre', 'gorge_dlr', 'toux', 'trav1_@_N', 'z77.22']"

In [9]:
# Get the index of the selected characteristic of the selected evidence of the selected patient
def get_index_of_selected_caracteristic(patient_id, evidence_id, dataframe=None, evidences_df=None):
    """Return where a patient's evidence value sits in that evidence's 'possible-values' list.

    An entry of the EVIDENCES list is either a bare evidence name (e.g. 'crowd') or
    '<evidence name>_@_<value>' (e.g. 'douleurxx_intens_@_4').

    Parameters
    ----------
    patient_id
        Row label of the patient in ``dataframe``.
    evidence_id : int
        Position of the evidence inside the patient's parsed EVIDENCES list.
    dataframe : pandas.DataFrame, optional
        Patient table with an 'EVIDENCES' column holding the string form of a list.
        Defaults to the notebook-level ``training_df``.
    evidences_df : pandas.DataFrame, optional
        Evidence metadata (one column per evidence, with a 'possible-values' row).
        Defaults to the notebook-level ``release_evidences_df``.

    Returns
    -------
    int or None
        Index of the value inside the evidence's 'possible-values' list, or ``None``
        when the entry carries no '_@_' value (there is no characteristic to look up).

    Raises
    ------
    ValueError
        If the value is not one of the evidence's possible values.
    """
    if dataframe is None:
        dataframe = training_df
    if evidences_df is None:
        evidences_df = release_evidences_df

    # The EVIDENCES cell is a string such as "['crowd', 'toux', ...]"; parse it into a list
    patient_evidence = ast.literal_eval(str(dataframe['EVIDENCES'][patient_id]))
    selected_evidence = patient_evidence[evidence_id]

    # The evidence name and its value are separated by '_@_'
    selected_evidence_name, separator, selected_caracteristic = selected_evidence.partition('_@_')
    if not separator:
        # Bare evidence name: nothing to look up
        return None

    # Possible values may be stored as numbers (e.g. [0, 1, ..., 10]) while the value parsed
    # from the evidence string is always text, so compare on the string form.
    possible_values = [str(value) for value in evidences_df[selected_evidence_name]['possible-values']]
    return possible_values.index(selected_caracteristic)

In [10]:
# Example: entry 5 of patient 0's EVIDENCES list is 'douleurxx_endroitducorps_@_front';
# print the position of its value within that evidence's possible values.
print(get_index_of_selected_caracteristic(0, 5))

74


In [11]:
# Re-read the training dataset from disk so training_df is back in its as-loaded state.
# The path is built with pathlib so it resolves on any OS (a hard-coded '\\' separator
# only works on Windows).
training_df = pd.read_csv(
    Path('..') / 'Datasets' / 'Patient-disease-symptom' / 'release_train_patients.csv'
)

In [12]:
# The EVIDENCES column contains a list of evidences, with a characteristic attached
# ('<evidence>_@_<value>') when the evidence is not binary.
# The goal is to turn that list into columns: one column per binary evidence and one
# column per possible characteristic of every non-binary evidence.
def convert_evidences_to_columns(dataframe):
    """Print the expanded evidence column names, split into three chunks.

    Only the column names are computed and printed here; nothing is returned and
    ``dataframe`` is not read or modified yet.

    Parameters
    ----------
    dataframe
        Patient table the columns are meant for (currently unused).
    """
    def expand(evidence):
        # A binary evidence ('B') maps to a single column named after the evidence;
        # any other evidence maps to one '<evidence>_@_<value>' column per possible value.
        evidence_meta = release_evidences_df[evidence]
        if evidence_meta['data_type'] == 'B':
            return [evidence]
        return [evidence + '_@_' + str(value) for value in evidence_meta['possible-values']]

    evidences_columns = [
        column_name
        for evidence in release_evidences_df.columns
        for column_name in expand(evidence)
    ]

    # For performance reasons the columns are divided into 3 lists that can be
    # concatenated later.
    total = len(evidences_columns)
    first_cut = int(total / 3)
    second_cut = int(total * 2 / 3)
    chunks = (
        evidences_columns[:first_cut],
        evidences_columns[first_cut:second_cut],
        evidences_columns[second_cut:],
    )
    for chunk_number, chunk in enumerate(chunks, start=1):
        print('evidences_columns_{}: '.format(chunk_number), chunk)
        print('len(evidences_columns_{}): '.format(chunk_number), len(chunk))
    print('len(evidences_columns): ', total)
    # TODO: the fill step is not implemented. The earlier (disabled) draft appended the new
    # columns to `dataframe`, parsed each patient's EVIDENCES list with ast.literal_eval and
    # set the column of every evidence present for that patient to 1.

In [13]:
# Print the expanded evidence column names in three chunks
# (the function does not use the dataframe argument yet).
convert_evidences_to_columns(training_df)

evidences_columns_1:  ['fievre', 'douleurxx_endroitducorps_@_nulle_part', 'douleurxx_endroitducorps_@_aile_iliaque_D_', 'douleurxx_endroitducorps_@_aile_iliaque_G_', 'douleurxx_endroitducorps_@_aine_D_', 'douleurxx_endroitducorps_@_aine_G_', 'douleurxx_endroitducorps_@_aisselle_D_', 'douleurxx_endroitducorps_@_aisselle_G_', 'douleurxx_endroitducorps_@_amygdale_D_', 'douleurxx_endroitducorps_@_amygdale_G_', 'douleurxx_endroitducorps_@_anus', 'douleurxx_endroitducorps_@_arrière_de_la_cheville_D_', 'douleurxx_endroitducorps_@_arrière_de_la_cheville_G_', 'douleurxx_endroitducorps_@_arrière_de_tête', 'douleurxx_endroitducorps_@_arrière_du_cou', 'douleurxx_endroitducorps_@_avant-bras_D_', 'douleurxx_endroitducorps_@_avant-bras_G_', 'douleurxx_endroitducorps_@_bas_du_thorax', 'douleurxx_endroitducorps_@_biceps_D_', 'douleurxx_endroitducorps_@_biceps_G_', 'douleurxx_endroitducorps_@_bouche', 'douleurxx_endroitducorps_@_cartilage_thyroidien', 'douleurxx_endroitducorps_@_cheville_D_', 'douleurxx

In [14]:
# Build the expanded column names at notebook level: a binary evidence ('B') keeps its own
# name, any other evidence gets one '<evidence>_@_<value>' name per possible value.
evidences_columns = []
for evidence in release_evidences_df.columns:
    evidence_meta = release_evidences_df[evidence]
    if evidence_meta['data_type'] == 'B':
        evidences_columns.append(evidence)
        continue
    evidences_columns.extend(
        evidence + '_@_' + str(value) for value in evidence_meta['possible-values']
    )

# For performance reasons the columns are divided into 3 lists that can be concatenated later
one_third = int(len(evidences_columns) / 3)
two_thirds = int(len(evidences_columns) * 2 / 3)
evidences_columns_1 = evidences_columns[:one_third]
evidences_columns_2 = evidences_columns[one_third:two_thirds]
evidences_columns_3 = evidences_columns[two_thirds:]

In [None]:
# Create a zero-filled dataframe for each list of columns (for performance reasons).
# The cells only ever hold 0/1 flags, so they are stored as uint8 (1 byte per cell) instead
# of the default int64 (8 bytes per cell), which matters with one row per training patient.
training_df_1 = pd.DataFrame(0, index=np.arange(len(training_df)), columns=evidences_columns_1, dtype=np.uint8)
# training_df_2 = pd.DataFrame(0, index=np.arange(len(training_df)), columns=evidences_columns_2, dtype=np.uint8)
# training_df_3 = pd.DataFrame(0, index=np.arange(len(training_df)), columns=evidences_columns_3, dtype=np.uint8)

In [11]:
# Extract the questions from the evidence release file and write them to two text files
# (French and English), one line per evidence:
#     "<question text> <evidence code(s)>"
# The question texts come from the 'question_fr' / 'question_en' rows of each evidence column.
# A binary evidence contributes its own name as the code; any other evidence contributes one
# '<evidence>_@_<value>, ' entry per possible value.
#
# Both files are opened once, in write mode, so re-running this cell regenerates them instead
# of appending a second copy of every line. The paths are built with pathlib so they resolve
# on any OS.
questions_dir = Path('..') / 'Datasets' / 'Patient-disease-symptom'
with open(questions_dir / 'questions_fr.txt', 'w', encoding='utf-8') as questions_fr_file:
    with open(questions_dir / 'questions_en.txt', 'w', encoding='utf-8') as questions_en_file:
        for evidence in release_evidences_df.columns:
            evidence_meta = release_evidences_df[evidence]
            if evidence_meta['data_type'] != 'B':
                # Every entry, including the last one, keeps its trailing ', ' so the line
                # format is unchanged from previous exports.
                # TODO: the human-readable meaning of each value (row 'value_meaning',
                # keys 'fr' / 'en') is not written yet.
                codes = ''.join(
                    evidence + '_@_' + str(caracteristic) + ', '
                    for caracteristic in evidence_meta['possible-values']
                )
            else:
                codes = evidence
            # The code part is identical in both languages; only the question text differs
            questions_fr_file.write(evidence_meta['question_fr'] + ' ' + codes + '\n')
            questions_en_file.write(evidence_meta['question_en'] + ' ' + codes + '\n')

In [4]:
# Build the expanded column names without duplicates, keeping first-seen order.
# A binary evidence ('B') keeps its own name; any other evidence gets one
# '<evidence>_@_<value>' name per possible value.
candidate_columns = []
for evidence in release_evidences_df.columns:
    evidence_meta = release_evidences_df[evidence]
    if evidence_meta['data_type'] != 'B':
        candidate_columns.extend(
            evidence + '_@_' + str(caracteristic)
            for caracteristic in evidence_meta['possible-values']
        )
    else:
        candidate_columns.append(evidence)
# dict.fromkeys de-duplicates in linear time and preserves insertion order, replacing the
# scan of the whole growing list that a `not in` test would run for every candidate name.
evidences_columns = list(dict.fromkeys(candidate_columns))

In [5]:
# Show the de-duplicated column names and how many there are
print('evidences_columns: ', evidences_columns)
print('len(evidences_columns): ', len(evidences_columns))

evidences_columns:  ['fievre', 'douleurxx_endroitducorps_@_nulle_part', 'douleurxx_endroitducorps_@_aile_iliaque_D_', 'douleurxx_endroitducorps_@_aile_iliaque_G_', 'douleurxx_endroitducorps_@_aine_D_', 'douleurxx_endroitducorps_@_aine_G_', 'douleurxx_endroitducorps_@_aisselle_D_', 'douleurxx_endroitducorps_@_aisselle_G_', 'douleurxx_endroitducorps_@_amygdale_D_', 'douleurxx_endroitducorps_@_amygdale_G_', 'douleurxx_endroitducorps_@_anus', 'douleurxx_endroitducorps_@_arrière_de_la_cheville_D_', 'douleurxx_endroitducorps_@_arrière_de_la_cheville_G_', 'douleurxx_endroitducorps_@_arrière_de_tête', 'douleurxx_endroitducorps_@_arrière_du_cou', 'douleurxx_endroitducorps_@_avant-bras_D_', 'douleurxx_endroitducorps_@_avant-bras_G_', 'douleurxx_endroitducorps_@_bas_du_thorax', 'douleurxx_endroitducorps_@_biceps_D_', 'douleurxx_endroitducorps_@_biceps_G_', 'douleurxx_endroitducorps_@_bouche', 'douleurxx_endroitducorps_@_cartilage_thyroidien', 'douleurxx_endroitducorps_@_cheville_D_', 'douleurxx_e