# Extraction of the Most Common Comorbidities

In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Preprocessing 

In [2]:
def pre_process(text):
    """Normalise a free-text field before tokenisation.

    The text is lower-cased, then every run of digits and/or non-word
    characters is collapsed into a single space.
    """
    lowered = text.lower()
    # one space replaces each run of special characters and digits
    return re.sub("(\\d|\\W)+", " ", lowered)

In [3]:
covid_adv = pd.read_csv('covid_adv_pat.csv')
# Work on an explicit copy of the report id and the two free-text columns:
# later cells add and overwrite columns on `df`, and that must never touch
# (or warn about touching) `covid_adv`.
df = covid_adv.loc[:, ['VAERS_ID','CUR_ILL', 'HISTORY']].copy()

df

Unnamed: 0,VAERS_ID,CUR_ILL,HISTORY
0,1410490,,
1,1413866,,
2,896636,none,none
3,902418,none,none
4,902440,,
...,...,...,...
404844,1427468,Unknown,Unknown
404845,1427471,,
404846,1427472,,
404847,1427475,,


In [4]:
# Replace missing values in both free-text columns with 'No'
df[['CUR_ILL', 'HISTORY']] = df[['CUR_ILL', 'HISTORY']].fillna('No')

In [5]:
# Normalise both free-text columns element by element with pre_process
df['CUR_ILL'] = df['CUR_ILL'].map(pre_process)
df['HISTORY'] = df['HISTORY'].map(pre_process)

In [6]:
# Join the current-illness text and the medical-history text, separated by
# a single space, into one COMORBIDITY string per report
df['COMORBIDITY'] = df['CUR_ILL'].str.cat(df['HISTORY'], sep = " ")

## Creating vocabulary and word counts

Creating a sparse matrix representation of the word counts. Each column represents a word in the vocabulary. Each row represents a document in the dataset, and the values are the word counts.

CountVectorizer parameters:
- max_df = 0.85 : ignore words that appear in more than 85% of the documents, since words that are nearly everywhere carry little information.

- stop_words : the stop words to exclude from the vocabulary (here, NLTK's English stop words).

The resulting shape of word_count_vector is (number of rows, size of vocabulary). 

The size of the vocabulary can be limited by setting max_features = vocab_size when instantiating CountVectorizer. 

In [7]:
# Build the set of English stop words from NLTK's stopwords corpus; the
# words are only collected here and are excluded later by the vectorizer.
# NOTE(review): presumably the corpus has already been downloaded
# (e.g. nltk.download('stopwords')) — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

In [8]:
# One document per report: the combined comorbidity text as a plain list
docs = list(df['COMORBIDITY'])

In [9]:
# A sparse matrix representation of the word counts.
# stop_words is converted to a list because scikit-learn >= 1.2 validates
# the parameter and rejects a set.
cv = CountVectorizer(max_df = 0.85, stop_words = list(stop_words), max_features = 10000)
word_count_vector = cv.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Keep the counts sparse. Densifying with .toarray() would materialise one
# cell per (document, vocabulary word) pair, which does not scale to a few
# hundred thousand documents times up to 10,000 words.
word_count_df = pd.DataFrame.sparse.from_spmatrix(word_count_vector,
                                                  columns = feature_names)

In [10]:
# Display the document-by-word count table (one row per document, one column per vocabulary word)
word_count_df

Unnamed: 0,aa,aaa,ab,abated,abcess,abd,abdomen,abdominal,abdominis,abdominoplasty,...,zithromycin,zocor,zofran,zoloft,zolpidem,zone,zoster,zpack,zpak,zyrtec
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Total number of occurrences of each keyword across all documents,
# arranged as a two-column table sorted from most to least frequent
total_count = word_count_df.sum(axis = 0)
total_count_df = (
    total_count
    .rename_axis('keyword')
    .reset_index(name = 'total_count')
    .sort_values(by = 'total_count', ascending = False)
)

## Extracting common comorbidities

In [12]:
# Display the keywords ordered by total count (most frequent first)
total_count_df

Unnamed: 0,keyword,total_count
6070,none,212802
4012,history,41316
5418,medical,29368
676,asthma,26269
3984,high,24819
...,...,...
6309,opmd,3
3385,flx,3
966,bills,3
2291,dense,3


In [13]:
# Rows at positions 1-19 of the ranking (the first row is skipped)
total_count_df.iloc[1:20]

Unnamed: 0,keyword,total_count
4012,history,41316
5418,medical,29368
676,asthma,26269
3984,high,24819
1853,conditions,24571
1847,concurrent,23340
9451,unknown,22596
1034,blood,19868
4249,hypertension,19425
2397,diabetes,18086


In [14]:
# Rows at positions 20-39 of the ranking
total_count_df.iloc[20:40]

Unnamed: 0,keyword,total_count
605,arthritis,11157
2319,depression,10953
4319,hypothyroidism,10036
6469,pain,9523
2527,disorder,9490
1533,cholesterol,9103
3853,heart,8882
7658,reported,8805
4120,htn,8526
9364,type,8232


In [15]:
# Rows at positions 40-59 of the ranking
total_count_df.iloc[40:60]

Unnamed: 0,keyword,total_count
9083,thyroid,5638
811,back,5374
9968,years,5199
179,adverse,5118
1958,copd,5001
4841,kidney,4974
3280,fibromyalgia,4962
3026,event,4940
7138,prior,4568
5585,mild,4459


## Creating comorbidity features

by matching strings 

#### Extracted comorbidities

1. Asthma
2. Hypertension
3. Diabetes
4. Anxiety
5. Allergy
6. Arthritis
7. Depression
8. Hypothyroidism
9. High Cholesterol
10. Heart Disease
11. GERD
12. Cancer
13. Obesity
14. Migraines
15. Kidney Disease
16. Covid positive

In [16]:
# string match function
def strmatch(disease_category, column):
    """Find the entries of ``column`` that mention any term of a category.

    Parameters
    ----------
    disease_category : iterable of str
        Search terms. Each term is matched literally, as a substring of the
        lower-cased entry, so terms should be given in lower case. Regex
        metacharacters in a term have no special meaning. Empty terms are
        ignored.
    column : pandas.Series or mapping
        Any object whose ``.items()`` yields ``(index, value)`` pairs.

    Returns
    -------
    index_list : list
        Index labels of the matching entries, in iteration order.
    match_list : list
        The original (unmodified) values of the matching entries.
    """
    # Escape every term so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    terms = [re.escape(term) for term in disease_category if term]
    if not terms:
        # An empty alternation would match every entry; nothing to search for.
        return [], []

    # Compile once instead of rebuilding the pattern for every row.
    pattern = re.compile('|'.join(terms))

    index_list = []
    match_list = []
    for index, disease in column.items():
        # str() keeps non-string values (e.g. missing entries) from raising
        if pattern.search(str(disease).lower()):
            match_list.append(disease)
            index_list.append(index)

    return index_list, match_list

In [17]:
# Dictionary mapping each comorbidity feature name to the list of row
# index labels (as returned by strmatch) whose text mentions it
comorbidity_idx = {}

#### 1. Asthma

In [18]:
# Substring terms that flag a report as mentioning asthma
disease_string = ['asthma']
asthma_idx, asthma_match = strmatch(disease_string, df['COMORBIDITY'])

In [19]:
# Record the matching row labels under the 'Asthma_c' feature name
comorbidity_idx['Asthma_c'] = asthma_idx

#### 2. Hypertension

In [20]:
# Substring terms that flag a report as mentioning hypertension,
# including both word orders of "high blood pressure"
disease_string = ['hypertension', 'high blood pressure', 'blood pressure high']
hypertension_idx, hypertension_match = strmatch(disease_string, df['COMORBIDITY'])

In [21]:
# Record the matching row labels under the 'Hypertension_c' feature name
comorbidity_idx['Hypertension_c'] = hypertension_idx

#### 3. Diabetes

In [22]:
# Substring terms that flag a report as mentioning diabetes
disease_string = ['diabetes', 'diabetic']
diabetes_idx, diabetes_match = strmatch(disease_string, df['COMORBIDITY'])

In [23]:
# Record the matching row labels under the 'Diabetes_c' feature name
comorbidity_idx['Diabetes_c'] = diabetes_idx

#### 4. Anxiety

In [24]:
# Substring terms that flag a report as mentioning anxiety
disease_string = ['anxiety']
anxiety_idx, anxiety_match = strmatch(disease_string, df['COMORBIDITY'])

In [25]:
# Record the matching row labels under the 'Anxiety_c' feature name
comorbidity_idx['Anxiety_c'] = anxiety_idx

#### 5. Allergy

In [26]:
# Substring terms that flag a report as mentioning an allergy
disease_string = ['allergy', 'allergies', 'allergic']
allergy_idx, allergy_match = strmatch(disease_string, df['COMORBIDITY'])

In [27]:
# Record the matching row labels under the 'Allergy_c' feature name
comorbidity_idx['Allergy_c'] = allergy_idx

#### 6. Arthritis

In [28]:
# Substring terms counted towards the arthritis feature; note that
# fibromyalgia is grouped here, and 'osteoarthritis' is already covered by
# the substring 'arthritis'
disease_string = ['arthritis', 'osteoarthritis', 'fibromyalgia']
arthritis_idx, arthritis_match = strmatch(disease_string, df['COMORBIDITY'])

In [29]:
# Record the matching row labels under the 'Arthritis_c' feature name
comorbidity_idx['Arthritis_c'] = arthritis_idx

#### 7. Depression

In [30]:
# Substring terms that flag a report as mentioning depression
disease_string = ['depression']
depression_idx, depression_match = strmatch(disease_string, df['COMORBIDITY'])

In [31]:
# Record the matching row labels under the 'Depression_c' feature name
comorbidity_idx['Depression_c'] = depression_idx

#### 8. Hypothyroidism 

In [32]:
# Substring terms that flag a report as mentioning hypothyroidism.
# Every term is followed by a comma: without one, adjacent string literals
# are silently concatenated into a single term that never matches.
disease_string = [
    'hypothyroidism',
    'underactive thyroid',
    'low thyroid',
    'hypothyreosis',
    'under active thyroid',
    'thyroid low',
    'hypothyroid',
]
hypothyroid_idx, hypothyroid_match = strmatch(disease_string, df['COMORBIDITY'])

In [33]:
# Record the matching row labels for the hypothyroidism feature.
# NOTE(review): the key is spelled 'Hypothyroidsm_c' (missing an 'i'). It
# becomes a column name in the exported CSV, so rename it only together
# with whatever reads that file.
comorbidity_idx['Hypothyroidsm_c'] = hypothyroid_idx

#### 9. High Cholesterol

In [34]:
# Substring terms counted towards the high-cholesterol feature.
# NOTE(review): any mention of 'cholesterol' matches, not only "high
# cholesterol" — confirm this is intended.
disease_string = ['cholesterol', 'hyperlipidemia']
cholesterol_idx, cholesterol_match = strmatch(disease_string, df['COMORBIDITY'])

In [35]:
# Record the matching row labels under the 'High_Cholesterol_c' feature name
comorbidity_idx['High_Cholesterol_c'] = cholesterol_idx

#### 10. Heart Disease

In [36]:
# Substring term counted towards the heart-disease feature.
# NOTE(review): 'heart' is matched as a substring, so text such as
# 'heartburn' or 'heart rate' also qualifies — confirm this is intended.
disease_string = ['heart']
heart_idx, heart_match = strmatch(disease_string, df['COMORBIDITY'])

In [37]:
# Record the matching row labels under the 'Heart_Disease_c' feature name
comorbidity_idx['Heart_Disease_c'] =heart_idx

#### 11. Gerd

In [38]:
# Substring terms that flag a report as mentioning GERD (acronym,
# spelled-out name, and "chronic acid reflux")
disease_string = ['gerd', 'gastroesophageal reflux disease', 'chronic acid reflux']
gerd_idx, gerd_match = strmatch(disease_string, df['COMORBIDITY'])

In [39]:
# Record the matching row labels under the 'GERD_c' feature name
comorbidity_idx['GERD_c'] = gerd_idx

#### 12. Cancer

In [40]:
# Substring terms that flag a report as mentioning cancer: the generic
# words plus several named tumour types
disease_string = ['cancer', 'malignant', 'malignancy', 'sarcoma', 'carcinoma', 'melanoma',
                  'lymphoma', 'leukemia']
cancer_idx, cancer_match = strmatch(disease_string, df['COMORBIDITY'])

In [41]:
# Record the matching row labels under the 'Cancer_c' feature name
comorbidity_idx['Cancer_c'] = cancer_idx

#### 13. Obesity

In [42]:
# Substring terms counted towards the obesity feature (overweight is
# grouped here as well)
disease_string = ['obesity', 'obese', 'adiposity', 'overweight']
obesity_idx, obesity_match = strmatch(disease_string, df['COMORBIDITY'])

In [43]:
# Record the matching row labels under the 'Obesity_c' feature name
comorbidity_idx['Obesity_c'] = obesity_idx

#### 14. Migraines




In [44]:
# Substring terms counted towards the migraine feature; any mention of
# 'headache' is grouped here too, and 'migraines' is already covered by the
# substring 'migraine'
disease_string = ['migraines', 'migraine', 'headache', 'cephalgia']
migraine_idx, migraine_match = strmatch(disease_string, df['COMORBIDITY'])

In [45]:
# Record the matching row labels under the 'Migraine_c' feature name
comorbidity_idx['Migraine_c'] = migraine_idx

#### 15. Kidney Disease


In [46]:
# Substring terms counted towards the kidney-disease feature.
# NOTE(review): any mention of 'kidney' or 'renal' matches (e.g. "kidney
# stones") — confirm this is intended.
disease_string = ['kidney', 'renal']
kidney_idx, kidney_match = strmatch(disease_string, df['COMORBIDITY'])

In [47]:
# Record the matching row labels under the 'Kidney_Disease_c' feature name
comorbidity_idx['Kidney_Disease_c'] = kidney_idx

#### 16. Covid Positive

In [48]:
# Substring terms counted towards the COVID-19 feature.
# NOTE(review): 'covid positive' is redundant because 'covid' already
# matches it, and any mention of 'covid' is flagged, not only positive
# cases — confirm this is intended.
disease_string = ['covid', 'covid positive']
covid_idx, covid_match = strmatch(disease_string, df['COMORBIDITY'])

In [49]:
# Record the matching row labels under the 'COVID19_Positive' feature name
# (unlike the other features, this key has no '_c' suffix)
comorbidity_idx['COVID19_Positive'] = covid_idx

#### Creating feature columns

In [50]:
# Map each comorbidity feature to the VAERS_IDs of the reports that matched.
comorbidity_ID = {}

for key, row_labels in comorbidity_idx.items():
    # strmatch returns index *labels*, so resolve them with .loc; positional
    # .iloc only gives the same rows while df has a default 0..n-1 index.
    comorbidity_ID[key] = df.loc[row_labels, 'VAERS_ID'].tolist()

# Add one 'Yes'/'No' column per comorbidity to the full dataset.
for key, matched_ids in comorbidity_ID.items():
    # Vectorised membership test; checking each ID against a Python list
    # costs (number of reports) x (number of matches) comparisons per feature.
    has_condition = covid_adv['VAERS_ID'].isin(matched_ids)
    covid_adv[key] = has_condition.map({True: 'Yes', False: 'No'})

In [51]:
# Sanity check: show the raw free text of every report flagged for depression
covid_adv.loc[covid_adv['Depression_c'].eq('Yes'), ['CUR_ILL', 'HISTORY']]

Unnamed: 0,CUR_ILL,HISTORY
64,,"Depression, Stress Incontinence"
98,none,"major depression, generalized anxiety disorder"
172,Acute migraine,"Migraines, Depression"
211,,"Hypertension, pre-diabetes, anxiety, panic att..."
243,,"Migraine, anxiety, depression, hyperlipidemia"
...,...,...
404727,,"Depression, anxiety, IBS, fatty liver, RLS , e..."
404808,,"anxiety, asthma, seasonal allergies, depression"
404817,,Depression and Anxiety
404826,no,migraine depression


In [52]:
# Drop the raw free-text columns now that the comorbidity flags exist.
# Rebinding the result (instead of inplace = True) together with
# errors = 'ignore' keeps this cell safe to run more than once; the in-place
# version raises KeyError on a second run because the columns are gone.
covid_adv = covid_adv.drop(columns = ['CUR_ILL', 'HISTORY'], errors = 'ignore')

In [53]:
# Display the dataset with the comorbidity flag columns
covid_adv

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,DIED,DISABLE,BIRTH_DEFECT,L_THREAT,HOSPITAL,VAX_MANU,Asthma_c,...,Depression_c,Hypothyroidsm_c,High_Cholesterol_c,Heart_Disease_c,GERD_c,Cancer_c,Obesity_c,Migraine_c,Kidney_Disease_c,COVID19_Positive
0,1410490,54.0,F,No,No,No,No,No,PFIZER\BIONTECH,No,...,No,No,No,No,No,No,No,No,No,No
1,1413866,67.0,M,No,No,No,No,No,MODERNA,No,...,No,No,No,No,No,No,No,No,No,No
2,896636,47.0,F,No,No,No,No,No,MODERNA,No,...,No,No,No,No,No,No,No,No,No,No
3,902418,56.0,F,No,No,No,No,No,PFIZER\BIONTECH,No,...,No,No,No,No,No,No,No,No,No,No
4,902440,35.0,F,No,No,No,No,No,PFIZER\BIONTECH,No,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404844,1427468,17.0,F,No,No,No,No,No,PFIZER\BIONTECH,No,...,No,No,No,No,No,No,No,No,No,No
404845,1427471,18.0,M,No,No,No,No,No,MODERNA,No,...,No,No,No,No,No,No,No,No,No,No
404846,1427472,54.0,F,No,No,No,No,No,MODERNA,No,...,No,No,No,No,No,No,No,No,No,No
404847,1427475,87.0,F,Yes,No,No,No,Yes,MODERNA,No,...,No,No,No,No,No,No,No,No,No,No


In [54]:
# Write the dataset to CSV; index = False leaves the row index out of the file
covid_adv.to_csv('covid_adv_comorbidity.csv', index = False)