In [34]:
import pandas as pd 
import json
import re

### Level 1: From synonyms to a single symptom

In [35]:
#### Reading long covid symptoms lexicon dataframe
# The columns used further down are `symptom`, `consolidated_UMLS_concepts`
# (pipe-separated "C<digits>:<name>" entries) and `synonyms` (pipe-separated).
lexicon = pd.read_csv('post_covid19_symptom_lexicon.csv')
lexicon.head()

Unnamed: 0,symptom,consolidated_UMLS_concepts,synonyms,percentage,ranking
0,pain,C0030193:Pain|C0150055:Chronic Pain|C0151825:B...,A26-A29 TYPES OF PAIN|abdominal cramps|abdomin...,0.431,1
1,anxiety,C0003467:Anxiety|C0003469:Anxiety Disorder|C00...,"[""Nerves""] or [nervousness] or [nervous tensio...",0.258,2
2,depression,C0001539:Adjustment Disorder With Depressed Mo...,Reactive depression NOS|9-52 DEPRESSIVE DISOR...,0.24,3
3,fatigue,C0015672:Fatigue|C0015674:Chronic Fatigue Synd...,(non-specific) malaise and fatigue|Akureyri|Ak...,0.234,4
4,joint pain,C0003862:Joint Pain|C0007859:Neck Pain|C001955...,(Cervicalgia) or (neck pain NOS)|Ache in joint...,0.21,5


In [36]:
#### building list of the basic symptoms
basic_symptom_list_indices = lexicon.index.to_list()

#### Normalise every symptom name: non-breaking spaces (e.g. 'postcoital\xa0bleeding')
#### become regular spaces and surrounding whitespace is removed. Doing this for
#### all names, rather than patching one hard-coded position, keeps the fix
#### correct if the row order of the lexicon CSV ever changes.
basic_symptom_list = [
    symptom.replace('\xa0', ' ').strip()
    for symptom in lexicon.symptom.to_list()
]

In [37]:
# For every lexicon row: remove the "C<digits>:" identifier prefixes from the
# pipe-separated concept string, then split it into a list of concept names.
consolidated_UMLS_concepts_list = [
    re.sub(r'C[0-9]*:', '', row_concepts).split('|')
    for row_concepts in lexicon.consolidated_UMLS_concepts.to_list()
]
lexicon['consolidated_UMLS_concepts_list'] = consolidated_UMLS_concepts_list

# One row per concept name; the index still points at the originating lexicon row.
uml_concepts_exploded_df = lexicon.explode('consolidated_UMLS_concepts_list')
consolidated_UMLS_concepts_list_indices = uml_concepts_exploded_df.index.to_list()
consolidated_UMLS_concepts_list_lower = [
    concept_name.lower()
    for concept_name in uml_concepts_exploded_df.consolidated_UMLS_concepts_list.to_list()
]

In [38]:
# Split each row's pipe-separated synonyms string into a list of raw synonyms.
synonym_symptoms_list = [synonyms.split('|') for synonyms in lexicon.synonyms]

# Flat, normalised (lower-cased and stripped) view of every synonym in row order.
synonym_symptoms_list_flat = [
    word.lower().strip()
    for symptom_list in synonym_symptoms_list
    for word in symptom_list
]

lexicon['synonym_symptoms_list'] = synonym_symptoms_list

# One row per synonym; the index still points at the originating lexicon row.
synonym_symptoms_list_exploded_df = lexicon.explode('synonym_symptoms_list')

# Normalise exactly like the flat list above. Previously only .lower() was
# applied here, so synonyms with surrounding whitespace ended up as separate
# dictionary keys that could never match a cleaned lookup term.
synonym_symptoms_list_lower = [
    synonym.lower().strip()
    for synonym in synonym_symptoms_list_exploded_df.synonym_symptoms_list.to_list()
]
synonym_symptoms_list_indices = synonym_symptoms_list_exploded_df.index.to_list()

In [39]:
#### creating final symptoms synonyms lists
#### (basic symptom names, then UMLS concept names, then synonyms; the two
#### lists stay aligned element by element)
lexicon_symptoms_list = [
    *basic_symptom_list,
    *consolidated_UMLS_concepts_list_lower,
    *synonym_symptoms_list_lower,
]
lexicon_symptoms_list_indices = [
    *basic_symptom_list_indices,
    *consolidated_UMLS_concepts_list_indices,
    *synonym_symptoms_list_indices,
]

In [40]:
#### creating symptoms dictionary
#### each symptom synonym is associated with the appropriate indices of the original symptom

# Group the lexicon row indices by term. A term that occurs for several
# symptoms collects all of their indices. Values are always lists, so no
# separate "scalar value" branch is needed.
lexicon_symptoms_dict = {}
for term, symptom_index in zip(lexicon_symptoms_list, lexicon_symptoms_list_indices):
    lexicon_symptoms_dict.setdefault(term, []).append(symptom_index)

# Remove duplicate indices (the same term can be listed more than once for the same symptom).
lexicon_symptoms_dict = {k: list(set(v)) for (k, v) in lexicon_symptoms_dict.items()}
lexicon_symptoms_dict

{'pain': [0],
 'anxiety': [1],
 'depression': [2],
 'fatigue': [3],
 'joint pain': [4],
 'shortness of breath': [5],
 'headaches': [6],
 'nausea and/or vomiting': [7],
 'myalgia': [8],
 'gastroesophageal reflux': [9],
 'cough': [10],
 'back pain': [11],
 'stress': [12],
 'fever': [13],
 'swelling': [35, 14],
 'bleeding': [15],
 'weight loss': [16],
 'abdominal pain': [17],
 'dizziness or vertigo': [18],
 'chest pain': [19],
 'weakness': [20],
 'constipation': [21],
 'skin lesion': [22],
 'wheezing': [23],
 'rash': [24],
 'insomnia': [25],
 'pain in extremities': [26],
 'paresthesia': [27],
 'peripheral edema': [28],
 'palpitations': [29],
 'diarrhea': [30],
 'itching': [31],
 'erythema': [32],
 'lower urinary tract symptoms': [33],
 'lymphadenopathy': [34],
 'edema': [35],
 'weight gain': [36],
 'sinonasal congestion': [37],
 'pain in throat': [38],
 'abnormal gait': [39],
 'respiratory depression': [40],
 'visual changes': [41],
 'chills': [42],
 'urinary incontinence': [43],
 'sleep 

In [41]:
#### checking the synonyms mapped to multiple symptoms at once to see if we can map them to single symptom
for synonym, symptom_indices in lexicon_symptoms_dict.items():
    if isinstance(symptom_indices, list) and len(symptom_indices) > 1:
        print('Symptom synonym: ' + str(synonym))
        print('Original symptoms mapped to this symptom: ')
        # Plain loop instead of a list comprehension used only for its side effects.
        for symptom_index in symptom_indices:
            print(basic_symptom_list[symptom_index])

Symptom synonym: swelling
Original symptoms mapped to this symptom: 
edema
swelling
Symptom synonym: abnormal breathing
Original symptoms mapped to this symptom: 
abnormal breathing
shortness of breath
Symptom synonym: cramping
Original symptoms mapped to this symptom: 
muscle cramps
cramping
Symptom synonym: ulcer
Original symptoms mapped to this symptom: 
ulcer
skin lesion
Symptom synonym: spasm
Original symptoms mapped to this symptom: 
spasm
muscle cramps
Symptom synonym: tenderness
Original symptoms mapped to this symptom: 
myalgia
tenderness
Symptom synonym: muscle cramps
Original symptoms mapped to this symptom: 
spasm
muscle cramps
Symptom synonym: loss of consciousness
Original symptoms mapped to this symptom: 
mental status change
loss of consciousness
Symptom synonym: menstrual spotting
Original symptoms mapped to this symptom: 
vaginal bleeding
menstrual spotting
Symptom synonym: restlessness
Original symptoms mapped to this symptom: 
restlessness
psychomotor agitation
agit

In [42]:
#### associating the same words together
#### e.g.: 'swelling' is mapped to edema and swelling, we map it to only swelling
# Compare against the cleaned names in basic_symptom_list (the same names the
# dictionary keys were built from) rather than the raw `symptom` column, so a
# symptom whose raw name needed cleaning still matches its own key.
# Re-assigning the value of an existing key while iterating is safe because
# the set of keys does not change.
for synonym, symptom_indices in lexicon_symptoms_dict.items():
    for symptom_index in symptom_indices:
        if basic_symptom_list[symptom_index] == synonym:
            lexicon_symptoms_dict[synonym] = [symptom_index]
            break

In [43]:
#### checking again which synonyms are still mapped to multiple symptoms after collapsing exact-name matches
for synonym, symptom_indices in lexicon_symptoms_dict.items():
    if isinstance(symptom_indices, list) and len(symptom_indices) > 1:
        print('Symptom synonym: ' + str(synonym))
        print('Original symptoms mapped to this symptom: ')
        # Plain loop instead of a list comprehension used only for its side effects.
        for symptom_index in symptom_indices:
            print(basic_symptom_list[symptom_index])

Symptom synonym: dull pain
Original symptoms mapped to this symptom: 
pain
chest pain
Symptom synonym: neurotic depression
Original symptoms mapped to this symptom: 
depression
dysthymia
Symptom synonym: break-through bleeding
Original symptoms mapped to this symptom: 
vaginal bleeding
bleeding
Symptom synonym: abdominal cramps
Original symptoms mapped to this symptom: 
pain
abdominal pain
Symptom synonym: numbness
Original symptoms mapped to this symptom: 
paresthesia
hypesthesia
Symptom synonym: decreased urine output
Original symptoms mapped to this symptom: 
lower urinary tract symptoms
oliguria
Symptom synonym: respiratory arrest
Original symptoms mapped to this symptom: 
respiratory depression
sleep apnea
Symptom synonym: rigor
Original symptoms mapped to this symptom: 
chills
muscle stiffness
Symptom synonym: confusional state
Original symptoms mapped to this symptom: 
mental status change
confusion
Symptom synonym: collapse
Original symptoms mapped to this symptom: 
anaphylaxis

In [44]:
# Inspect the symptom name stored under row label 280 (the index used for the
# 'swollen throat' manual override).
lexicon['symptom'][280]

'throat swelling'

In [45]:
#### for the remaining synonyms we keep the closest associated symptom in meaning to avoid confusion
#### Each value is a single-element list holding the lexicon row index of the chosen symptom;
#### the trailing comment names the chosen symptom and the alternative it replaces.
#### NOTE: the indices are hard-coded, so they depend on the row order of the lexicon CSV.

lexicon_symptoms_dict['dull pain'] = [0] #pain instead of chest pain
lexicon_symptoms_dict['neurotic depression'] = [2] #depression instead of dysthymia
lexicon_symptoms_dict['break-through bleeding'] = [93] #vaginal bleeding instead of bleeding
lexicon_symptoms_dict['abdominal cramps'] = [17] #abdominal pain instead of pain
lexicon_symptoms_dict['numbness'] = [165] #hypesthesia instead of paresthesia
lexicon_symptoms_dict['decreased urine output'] = [301] #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['respiratory arrest'] = [40] #respiratory depression instead of sleep apnea
lexicon_symptoms_dict['rigor'] = [42] #chills instead of muscle stiffness
lexicon_symptoms_dict['confusional state'] = [45] #confusion instead of mental status change
lexicon_symptoms_dict['collapse'] = [52] #syncope instead of anaphylaxis
lexicon_symptoms_dict['hematochezia'] = [64] #gastrointestinal hemorrhage instead of blood in stools
lexicon_symptoms_dict['melena'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['sinus pain'] = [79] #facial pain instead of headaches
lexicon_symptoms_dict['ulceration'] = [22] #skin lesion instead of ulcer
lexicon_symptoms_dict['myofascial pain'] = [8] #myalgia instead of musculoskeletal pain
lexicon_symptoms_dict['blood in stool'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['dysphonia'] = [119] #hoarseness instead of difficulty speaking
lexicon_symptoms_dict['motor restlessness'] = [305] #akathisia instead of restlessness
lexicon_symptoms_dict['crampy abdominal pain'] = [17] #abdominal pain instead of pain
lexicon_symptoms_dict['oral pain'] = [0] #pain instead of facial pain
lexicon_symptoms_dict['depressive neurosis'] = [2] #depression instead of dysthymia
lexicon_symptoms_dict['feel bad'] = [205] #feeling bad (NOTE(review): the replaced alternative was not recorded here - confirm)
lexicon_symptoms_dict['feel ill'] = [205] #feeling bad instead of fatigue
lexicon_symptoms_dict['breathing difficulties'] = [5] #shortness of breath instead of respiratory depression
lexicon_symptoms_dict['difficulty breathing'] = [5] #shortness of breath instead of respiratory depression
lexicon_symptoms_dict['respiratory difficulties'] = [5] #shortness of breath instead of respiratory depression
lexicon_symptoms_dict['vomiting blood'] = [64] #gastrointestinal hemorrhage instead of nausea and/or vomiting
lexicon_symptoms_dict['facial puffiness'] = [198] #facial edema instead of swelling
lexicon_symptoms_dict['bleeding breakthrough'] = [93] #vaginal bleeding instead of bleeding
lexicon_symptoms_dict['bleeding diathesis'] = [156] #tendency to bleed instead of bleeding
lexicon_symptoms_dict['bleeding disorder'] = [156] #tendency to bleed instead of bleeding
lexicon_symptoms_dict['bleeding disorders'] = [156] #tendency to bleed instead of bleeding
lexicon_symptoms_dict['bleeding tendency'] = [156] #tendency to bleed instead of bleeding
lexicon_symptoms_dict['breakthrough bleeding'] = [93] #vaginal bleeding instead of bleeding
lexicon_symptoms_dict['intermenstrual bleeding'] = [93] #vaginal bleeding instead of bleeding
lexicon_symptoms_dict['spotting between menses'] = [93] #vaginal bleeding instead of bleeding
lexicon_symptoms_dict['wooziness'] = [18] #dizziness or vertigo instead of clouded consciousness
lexicon_symptoms_dict['paresis'] = [116] #muscle weakness instead of weakness
lexicon_symptoms_dict['peeling'] = [22] #skin lesion instead of skin irritation
lexicon_symptoms_dict['skin breakdown'] = [173] #impaired skin integrity instead of skin lesion
lexicon_symptoms_dict['spots'] = [22] #skin lesion instead of rash
lexicon_symptoms_dict['ulcerated'] = [81] #ulcer instead of skin lesion
lexicon_symptoms_dict['ulcerating'] = [81] #ulcer instead of skin lesion
lexicon_symptoms_dict['ulceration, nos'] = [81] #ulcer instead of skin lesion
lexicon_symptoms_dict['disturbances, sleep'] = [57] #sleep disorder instead of insomnia
lexicon_symptoms_dict['sleep disturbance'] = [57] #sleep disorder instead of insomnia
lexicon_symptoms_dict['sleep disturbances'] = [57] #sleep disorder instead of insomnia
lexicon_symptoms_dict['acquired lymphedema'] = [249] #lymphedema instead of peripheral edema
lexicon_symptoms_dict['acquired lymphoedema'] = [249] #lymphedema instead of peripheral edema
lexicon_symptoms_dict['urine output decreased'] = [301] #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['urine output low'] = [301] #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['urine production scanty'] = [301]  #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['urine volume deficient'] = [301] #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['volume urine decreased'] = [301] #oliguria instead of lower urinary tract symptoms
lexicon_symptoms_dict['excess fluid'] = [109] #fluid retention instead of edema
lexicon_symptoms_dict['blocked nose'] = [223] #nasal obstruction instead of sinonasal congestion
lexicon_symptoms_dict['difficulties gait'] = [39] #abnormal gait instead of walking disability
lexicon_symptoms_dict['gait difficulty'] = [39] #abnormal gait instead of walking disability
lexicon_symptoms_dict['limited mobility'] = [179] #reduced mobility instead of abnormal gait
lexicon_symptoms_dict['pulmonary arrest'] = [40] #respiratory depression instead of sleep apnea
lexicon_symptoms_dict['loss of vision'] = [129] #blindness instead of visual changes
lexicon_symptoms_dict['vision loss'] = [129] #blindness instead of visual changes
lexicon_symptoms_dict['acute confusional state'] = [45] #confusion instead of mental status change
lexicon_symptoms_dict['bewilderment'] = [45] #confusion instead of clouded consciousness
lexicon_symptoms_dict['dazed'] = [45] #confusion instead of clouded consciousness
lexicon_symptoms_dict['dazed state'] = [45] #confusion instead of clouded consciousness
lexicon_symptoms_dict['muddled'] = [45] #confusion instead of clouded consciousness
lexicon_symptoms_dict['blackout'] = [107] #loss of consciousness instead of syncope
lexicon_symptoms_dict['pass out'] = [107]  #loss of consciousness instead of syncope
lexicon_symptoms_dict['passed out'] = [107]  #loss of consciousness instead of syncope
lexicon_symptoms_dict['passing out'] = [107]  #loss of consciousness instead of syncope
lexicon_symptoms_dict['black faeces'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['black faeces symptom'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['black feces'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['black feces symptom'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['black stool'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['black stools'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['blood in faeces'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['blood in feces'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['bloody stool'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['dark stools'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['faeces: blood'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['feces: blood'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['haematochezia'] = [64] #gastrointestinal hemorrhage instead of blood in stools
lexicon_symptoms_dict['melaena'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['passage of bloody stools'] = [99] #blood in stools instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['stool black'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['stool tarry'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['tarry stool'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['tarry stools'] = [181] #feces color: tarry instead of gastrointestinal hemorrhage
lexicon_symptoms_dict['excessive overactivity'] = [158] #psychomotor agitation instead of agitation
lexicon_symptoms_dict['excessive overactivity, nos'] = [158] #psychomotor agitation instead of agitation
lexicon_symptoms_dict['increased purposeless goalless activity'] = [158] #psychomotor agitation instead of agitation
lexicon_symptoms_dict['increased purposeless goalless activity, nos'] = [158] #psychomotor agitation instead of agitation
lexicon_symptoms_dict['restless'] = [128] #restlessness instead of agitation
lexicon_symptoms_dict['restlessness marked'] = [128] #restlessness instead of agitation
lexicon_symptoms_dict['unable to keep still'] = [128] #restlessness instead of agitation
lexicon_symptoms_dict['hair thinning'] = [75] #hair loss instead of hypotrichosis
lexicon_symptoms_dict['affective psychosis nos'] = [145] #psychosis instead of mood swings
lexicon_symptoms_dict['muscle spasm'] = [88] #spasm instead of myotonia
lexicon_symptoms_dict['dysfunctional uterine bleeding'] = [139] #abnormal uterine bleeding instead of vaginal bleeding
lexicon_symptoms_dict['spotting'] = [127] #menstrual spotting instead of vaginal bleeding
lexicon_symptoms_dict['cold feel'] = [278] #feels cold instead of temperature intolerance
lexicon_symptoms_dict['cold feelings'] = [278] #feels cold instead of temperature intolerance
lexicon_symptoms_dict['feel cold'] = [278] #feels cold instead of temperature intolerance
lexicon_symptoms_dict['jerk'] = [243] #twitching instead of muscle twitching
lexicon_symptoms_dict['jerking'] = [243] #twitching instead of muscle twitching
lexicon_symptoms_dict['manic'] = [207] #manic mood instead of mania
lexicon_symptoms_dict['swollen throat'] = [280] #throat swelling instead of oropharyngeal swelling

In [46]:
# Flatten the mapping: keep only the first index stored for each synonym, and
# build a parallel mapping from synonym to the cleaned symptom name.
lexicon_symptoms_dict_numbered = {}
lexicon_symptoms_dict_named = {}
for synonym, symptom_indices in lexicon_symptoms_dict.items():
    first_index = symptom_indices[0]
    lexicon_symptoms_dict_numbered[synonym] = first_index
    lexicon_symptoms_dict_named[synonym] = basic_symptom_list[first_index]

In [47]:
# Serialise both synonym mappings, then write each one to its own JSON file.
lexicon_symptoms_json_numbered = json.dumps(lexicon_symptoms_dict_numbered)
lexicon_symptoms_json_named = json.dumps(lexicon_symptoms_dict_named)

for json_path, json_text in (
    ("lexicon_symptoms_dict_numbered.json", lexicon_symptoms_json_numbered),
    ("lexicon_symptoms_dict_named.json", lexicon_symptoms_json_named),
):
    with open(json_path, "w") as f:
        f.write(json_text)

### Level 2: From single symptom to the category of the symptom

In [48]:
# Reading the categorisation of the lexicon symptoms
# The `Category` and `Symptoms` columns are used below; `Symptoms` holds each
# category's symptoms as one comma-separated string.
symptom_categorisation = pd.read_csv('Symptoms Categorisation.csv')

In [49]:
# Display the categorisation table as loaded (Symptoms still a single string per category).
symptom_categorisation

Unnamed: 0,Category,Symptoms
0,Other Symptoms,"sexual dysfunction, priapism, dyspareunia, lib..."
1,General,"fatigue, fever, bleeding, weight loss, weaknes..."
2,Body Pain/Mobility,"pain, joint pain, myalgia, back pain, pain in ..."
3,Mental Health/Psychological/Behavioral,"stress, anxiety, depression, suffering, mental..."
4,Sleep,"insomnia, sleep apnea, sleep disorder, somnole..."
5,Otorhinolaryngology,"sinonasal congestion, nasal symptoms, pain in ..."
6,Neurological/Ocular,"headaches, facial pain, dizziness or vertigo,..."
7,Cardiorespiratory,"shortness of breath, cough, chest pain, wheezi..."
8,Gastrointestinal,"nausea and/or vomiting, gastroesophageal reflu..."
9,Vascular/Lymphatic,"bruising, lymphadenopathy, laryngeal edema, ly..."


In [50]:
# Transforming the symptoms in the Symptoms column from string type to list type.
# Values that are already lists are left untouched, so re-running this cell is
# harmless (previously a second run raised AttributeError because the column
# had been overwritten with lists).
symptom_categorisation['Symptoms'] = symptom_categorisation['Symptoms'].apply(
    lambda x: x if isinstance(x, list) else x.replace("'", '').split(',')
)

In [51]:
# Display the table again to check that Symptoms now holds lists.
symptom_categorisation

Unnamed: 0,Category,Symptoms
0,Other Symptoms,"[sexual dysfunction, priapism, dyspareunia, ..."
1,General,"[fatigue, fever, bleeding, weight loss, we..."
2,Body Pain/Mobility,"[pain, joint pain, myalgia, back pain, pai..."
3,Mental Health/Psychological/Behavioral,"[stress, anxiety, depression, suffering, m..."
4,Sleep,"[insomnia, sleep apnea, sleep disorder, som..."
5,Otorhinolaryngology,"[sinonasal congestion, nasal symptoms, pain ..."
6,Neurological/Ocular,"[headaches, facial pain, dizziness or verti..."
7,Cardiorespiratory,"[shortness of breath, cough, chest pain, wh..."
8,Gastrointestinal,"[nausea and/or vomiting, gastroesophageal ref..."
9,Vascular/Lymphatic,"[bruising, lymphadenopathy, laryngeal edema,..."


In [53]:
# Creating a dictionary where each symptom is associated with its category.
# Symptom names are stripped and empty entries are skipped; if a symptom is
# listed under several categories, the last one seen is kept.
symptom_category_dict = {}

for category, category_symptoms in zip(symptom_categorisation['Category'],
                                       symptom_categorisation['Symptoms']):
    for raw_symptom in category_symptoms:
        cleaned_symptom = raw_symptom.strip()
        if cleaned_symptom == '':
            continue
        symptom_category_dict[cleaned_symptom] = category

all_symptoms_list = list(symptom_category_dict)
original_symptom_list = pd.read_csv('post_covid19_symptom_lexicon.csv')['symptom']
len(original_symptom_list.to_list())

In [60]:
# Map every synonym to the category of its resolved symptom, then save the
# mapping in json format. A symptom without a category raises KeyError.
symptom_synonym_category_dict = {}
for synonym, symptom_name in lexicon_symptoms_dict_named.items():
    symptom_synonym_category_dict[synonym] = symptom_category_dict[symptom_name]

symptom_synonym_category_dict_json = json.dumps(symptom_synonym_category_dict)
with open("symptom_synonym_category_dict_2.json", "w") as f:
    f.write(symptom_synonym_category_dict_json)