# <center>Utility based filtering - Recommendation system</center>

The goal is to create a dataset with diseases and their symptoms using the descriptions of pathologies (json file)

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer

## Loading dataset using Dask

In [2]:
# load the dataset
# json files
release_evidences_df = pd.read_json('..\\Datasets\\Patient-disease-symptom\\release_evidences.json')
release_conditions_df = pd.read_json('..\\Datasets\\Patient-disease-symptom\\release_conditions.json')

# Explore data

In [3]:
release_conditions_df.head(7)

Unnamed: 0,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
condition_name,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
cond-name-fr,Pneumothorax spontané,Céphalée en grappe,Syndrome de Boerhaave,Fracture de côte spontanée,RGO,VIH (Primo-infection),Anémie,Pharyngite virale,Hernie inguinale,Myasthénie grave,...,Pneumonie,Rhinosinusite aigue,Rhinosinusite chronique,Bronchiolite,néoplasie pulmonaire,Possible NSTEMI / STEMI,Sarcoïdose,Néoplasie du pancréas,OAP/Surcharge pulmonaire,Péricardite
cond-name-eng,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
icd10-id,J93,g44.009,K22.3,S22.9,K21,B20,D64.9,J02.9,K40,G70.0,...,"j17, j18",j01,j32,j21,c34,I21,d86,c25,J81.0,I30
symptoms,"{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'pyrosis': {}, 'toux': {}, 'ww_bouffe': {}, '...","{'fievre': {}, 'fatig_ext': {}, 'msk_dlr': {},...","{'etourdissement': {}, 'fatig_mod': {}, 'fatig...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'dysphagie': {}, 'dysarthrie': {}, 'diplopie'...",...,"{'ww_respi': {}, 'douleurxx_endroitducorps': {...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'toux': {}, 'dyspn': {}, 'boire_ped': {}, 'rh...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'perte_appet': {}, 'douleurxx_endroitducorps'...","{'lesions_peau_endroitducorps': {}, 'lesions_p...","{'douleurxx_endroitducorps': {}, 'douleurxx': ...","{'ww_effort': {}, 'douleurxx_endroitducorps': ...","{'ww_respi': {}, 'ww_dd': {}, 'douleurxx_endro..."
antecedents,"{'f17.210': {}, 'pneumothorax': {}, 'ap_pneumo...","{'atcd_cluster': {}, 'f10.129': {}, 'rx_vasodi...","{'f10.129': {}, 'trav1': {}}","{'f10.129': {}, 'cancer_méta': {}, 'osteoporos...","{'e66': {}, 'preg1': {}, 'j45': {}, 'f10.129':...","{'itss_risque': {}, 'drogues_IV': {}, 'atcd_it...","{'Mauv_aliment': {}, 'atcd_anem': {}, 'atcd_fa...","{'dayc': {}, 'crowd': {}, 'f17.210': {}, 'cont...","{'perinatality': {}, 'e66': {}, 'trav1': {}}","{'atcdfam_mg': {}, 'trav1': {}}",...,"{'vaccination': {}, 'surg1': {}, 'j44_j42': {}...","{'vaccination': {}, 'j34.2': {}, 'f17.210': {}...","{'j06.9': {}, 'j34.2': {}, 'f17.210': {}, 'k21...","{'vaccination': {}, 'momasthma': {}, 'crowd': ...","{'smokingpast': {}, 'f17.210': {}, 'z80.1': {}...","{'i25.1': {}, 'i73.9': {}, 'smokingpast': {}, ...","{'e66': {}, 'tagri': {}, 'trav1': {}}","{'e10_e11': {}, 'f17.210': {}, 'e66': {}, 'z80...","{'i25.1': {}, 'i50': {}, 'J81': {}, 'i10': {},...","{'B34.9': {}, 'I30': {}, 'trav1': {}}"
severity,2,3,2,3,3,3,4,4,3,3,...,3,4,5,3,3,1,4,3,1,4


In [11]:
# Example of symptoms for a condition
release_conditions_df['VIH (Primo-infection)']['symptoms']

{'fievre': {},
 'fatig_ext': {},
 'msk_dlr': {},
 'perte_poids': {},
 'douleurxx_endroitducorps': {},
 'douleurxx': {},
 'douleurxx_irrad': {},
 'douleurxx_carac': {},
 'douleurxx_soudain': {},
 'douleurxx_intens': {},
 'douleurxx_precis': {},
 'gorge_dlr': {},
 'adp_dlr': {},
 'diarrhee': {},
 'lesions_peau_endroitducorps': {},
 'lesions_peau': {},
 'lesions_peau_couleur': {},
 'lesions_peau_intens': {},
 'lesions_peau_elevee': {},
 'lesions_peau_prurit': {},
 'lesions_peau_plusqu1cm': {},
 'lesions_peau_desquame': {},
 'nausee': {},
 'diaph': {}}

In [10]:
# Example of antecedents for a condition
release_conditions_df['VIH (Primo-infection)']['antecedents']

{'itss_risque': {},
 'drogues_IV': {},
 'atcd_its': {},
 'sex_vih': {},
 'trav1': {}}

# Create new dataset

## Create initial dataframe

In [15]:
# Let's create a dataframe with the evidences (symptoms and antecedents) for each condition
conditions_df = pd.DataFrame(columns=['condition', 'evidences'])
for condition in release_conditions_df:
    # the symptoms and antecedents are dictionaries with the keys being the symptoms/antecedents
    # we need to convert them to lists
    symptoms = list(release_conditions_df[condition]['symptoms'].keys())
    antecedents = list(release_conditions_df[condition]['antecedents'].keys())
    evidences = symptoms + antecedents
    conditions_df = pd.concat([conditions_df, pd.DataFrame([[condition, evidences]], columns=['condition', 'evidences'])], ignore_index=True)

conditions_df.head()

Unnamed: 0,condition,evidences
0,Pneumothorax spontané,"[douleurxx_endroitducorps, douleurxx, douleurx..."
1,Céphalée en grappe,"[douleurxx_endroitducorps, douleurxx, douleurx..."
2,Syndrome de Boerhaave,"[douleurxx_endroitducorps, douleurxx, douleurx..."
3,Fracture de côte spontanée,"[douleurxx_endroitducorps, douleurxx, douleurx..."
4,RGO,"[pyrosis, toux, ww_bouffe, douleurxx_endroitdu..."


The evidences now is a column containing the list of the columns. Let's first check how many unique evidences are there. And then let's make those values columns with values 0 and 1 depending on their existence

## Verifying unique evidences

In [16]:
# Let's check the unique evidences in the dataset we created
evidences = set()
for evidence in conditions_df['evidences']:
    evidences.update(evidence)
print('Number of unique evidences: {}'.format(len(evidences)))
print('Unique evidences: {}'.format(evidences))

Number of unique evidences: 223
Unique evidences: {'rds_paralys_gen', 'osteoporose', 'lesions_peau_intens', 'c00-d48', 'ap_asian', 'faible', 'protu_langue', 'Mauv_aliment', 'antipsy_récent', 'pertes_vag', 'faiblesse faciale', 'z80.1', 'melena', 'oedeme', 'trismus', 'trav1', 'z77.22', 'footnumb', 'perte_poids', 'atcdpsyfam', 'smokingpast', 'bw_bending', 'ains', 'lesions_peau_endroitducorps', 'selles_pale', 'naco', 'z82.49', 'vaccination', 'douleurxx_irrad', 'nau_psy_recent', 'contact', 'irritable', 'paralysie_visage', 'dyspn_noct', 'hosptisasm', 'f10.129', 'j06.9', 'palpit', 'boire_ped', 'pdc', 'douleurxx_soudain', 'obstipation', 'hyponos', 'allergie_sev', 'diarrhee', 'toux_Aboy', 'atcd_cluster', 'm79.7', 's09.90', 'H6690', 'apnee', 'B34.9', 'dysphagie', 'immob1', 'rds_sg', 'f17.210', 'ww_bouger', 'sex_vih', 'toux', 'fatig_mod', 'cortico', 'f32', 'dysarthrie', 'histfammigraine', 'tmine', 'suburb', 'lesions_peau_elevee', 'rectorragie', 'dayc', 'HIV', 'sialorhee', 'fam_j45', 'irc', 'synd_

In [19]:
# Let's read the evidences from the file (the evidences used on the original dataset)
evidences_txt = []
with open('..\\Datasets\\Patient-disease-symptom\\evidences.txt', 'r', encoding='utf-8') as f:
    # the evidences are stored as a list in the file, so we need to convert it to a list
    evidences_txt = ast.literal_eval(f.read())

# let's keep only the evidences names (the evidences are stored in the format: 'evidences_name_@_evidences_type')
old_evidences = set()
for evidence in evidences_txt:
    old_evidences.update([evidence.split('_@_')[0]])

print('Number of unique evidences: {}'.format(len(old_evidences)))
print('Unique evidences: {}'.format(old_evidences))

Number of unique evidences: 223
Unique evidences: {'c00-d48', 'lesions_peau_intens', 'osteoporose', 'rds_paralys_gen', 'ap_asian', 'faible', 'protu_langue', 'Mauv_aliment', 'antipsy_récent', 'pertes_vag', 'faiblesse faciale', 'z80.1', 'melena', 'oedeme', 'trismus', 'trav1', 'z77.22', 'footnumb', 'perte_poids', 'atcdpsyfam', 'smokingpast', 'bw_bending', 'ains', 'lesions_peau_endroitducorps', 'selles_pale', 'naco', 'z82.49', 'vaccination', 'douleurxx_irrad', 'nau_psy_recent', 'contact', 'irritable', 'paralysie_visage', 'dyspn_noct', 'hosptisasm', 'f10.129', 'j06.9', 'palpit', 'boire_ped', 'pdc', 'douleurxx_soudain', 'obstipation', 'hyponos', 'allergie_sev', 'diarrhee', 'toux_Aboy', 'atcd_cluster', 'm79.7', 's09.90', 'apnee', 'H6690', 'B34.9', 'dysphagie', 'immob1', 'rds_sg', 'f17.210', 'ww_bouger', 'sex_vih', 'fatig_mod', 'toux', 'cortico', 'f32', 'dysarthrie', 'histfammigraine', 'suburb', 'tmine', 'lesions_peau_elevee', 'dayc', 'HIV', 'rectorragie', 'fam_j45', 'sialorhee', 'irc', 'J81',

In [20]:
# Let's check if =the evidences in the dataset exist in the evidences.txt file
new_evidences = []
for evidence in evidences:
    if evidence not in old_evidences:
        new_evidences.append(evidence)

print('Number of evidences not in the evidences.txt file: {}'.format(len(new_evidences)))

Number of evidences not in the evidences.txt file: 0


Luckily everything is good. The evidences are the same in both datasets

## Transforming dataset

In [21]:
# Let's change the evidences lists in the df to columns with binary values
mlb = MultiLabelBinarizer()
conditions_df = conditions_df.join(pd.DataFrame(mlb.fit_transform(conditions_df.pop('evidences')), columns=mlb.classes_, index=conditions_df.index))
conditions_df.head()

Unnamed: 0,condition,B34.9,H6690,HIV,I30,J05.0,J81,K86.1,Mauv_aliment,Z99.2,...,ww_effort,ww_nuit,ww_respi,ww_valsalva,z77.22,z80.0,z80.1,z82.49,z84.89,z92.25
0,Pneumothorax spontané,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,Céphalée en grappe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Syndrome de Boerhaave,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fracture de côte spontanée,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,RGO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Verify that the evidences are correctly converted to columns (let's check one condition)
check = conditions_df[conditions_df['condition'] == 'VIH (Primo-infection)']

# get all the evidences that are 1 (True)
check = check[check == 1].dropna(axis=1, how='all').columns.tolist()

check

['adp_dlr',
 'atcd_its',
 'diaph',
 'diarrhee',
 'douleurxx',
 'douleurxx_carac',
 'douleurxx_endroitducorps',
 'douleurxx_intens',
 'douleurxx_irrad',
 'douleurxx_precis',
 'douleurxx_soudain',
 'drogues_IV',
 'fatig_ext',
 'fievre',
 'gorge_dlr',
 'itss_risque',
 'lesions_peau',
 'lesions_peau_couleur',
 'lesions_peau_desquame',
 'lesions_peau_elevee',
 'lesions_peau_endroitducorps',
 'lesions_peau_intens',
 'lesions_peau_plusqu1cm',
 'lesions_peau_prurit',
 'msk_dlr',
 'nausee',
 'perte_poids',
 'sex_vih',
 'trav1']

In [31]:
# Let's save the dataframe to a csv file
conditions_df.to_csv('..\\Datasets\\Patient-disease-symptom\\final datasets\\conditions.csv', index=False)

# Model selection

In [59]:
# Let's train some models to recommend conditions based on the evidences
# import necessary libraries
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier

In [33]:
# load the dataset
conditions_df = pd.read_csv('..\\Datasets\\Patient-disease-symptom\\final datasets\\conditions.csv')

In [34]:
# split the dataset into fetures and target
X = conditions_df.drop(['condition'], axis=1)
y = conditions_df['condition']

In [37]:
# train the model directly on the whole dataset (we will not use a test set as we want to recommend conditions based on the evidences)
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

In [69]:
# SGD
sgd = SGDClassifier(loss='log_loss')
sgd.fit(X, y)

In [62]:
# BernoulliNB
bernoulliNB = BernoulliNB()
bernoulliNB.fit(X, y)

In [64]:
# MLP
mlp = MLPClassifier(max_iter=300)
mlp.fit(X, y)

In [65]:
# Let's test the models
columns = X.columns.tolist()

In [89]:
import random
# random number of symptoms but  not equal to the number of symptoms in the dataset
symptoms_num = random.randint(1, 30)
# random symptoms
symptoms = []
for i in range(symptoms_num):
    symptoms.append(columns[random.randint(0, len(columns)-1)])
# we need to create a dataframe with the same columns as the dataset
symptoms_df = pd.DataFrame(columns=columns)
# we need to add the symptoms to the dataframe
symptoms_df.loc[0] = 0
for symptom in symptoms:
    symptoms_df[symptom] = 1

symptoms

['lesions_peau_desquame',
 'spasmes_msk',
 'diplopie',
 'fatig_mod',
 'drogues_IV',
 'trismus',
 'ww_bouffe',
 'fievre',
 'friss',
 'cont_coq',
 'stridor',
 'perte_poids',
 'lesions_peau_elevee',
 'Z99.2',
 'j06.9',
 'ebolacase',
 'footnumb',
 'momasthma',
 'horm1',
 'spasme_trapeze',
 'psy_depers',
 'contact_allergie']

In [90]:
# list results for all the models
results = pd.DataFrame(columns=['model', 'condition', 'probability'])
for model in [knn, sgd, bernoulliNB, mlp]:
    # predict the condition
    condition = model.predict(symptoms_df)[0]
    # predict the probability of the condition
    probability = max(model.predict_proba(symptoms_df)[0])
    # add the results to the dataframe
    results = pd.concat([results, pd.DataFrame([[model.__class__.__name__, condition, probability]], columns=['model', 'condition', 'probability'])], ignore_index=True)
    
results

Unnamed: 0,model,condition,probability
0,KNeighborsClassifier,Coqueluche,0.2
1,SGDClassifier,Anaphylaxie,0.991431
2,BernoulliNB,Laryngospasme,0.606141
3,MLPClassifier,Laryngospasme,0.282142
