## Binary Bag of Words Generation

This code generates a binary bag of words representation for 'comorbidities_other' column in the 'Canada_Hosp1_COVID_InpatientData'  dataset.

In [1]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Read the Excel file
data = pd.read_excel("Canada_Hosp1_COVID_InpatientData.xlsx")
data.head()

Unnamed: 0,id,reason_for_admission,age,sex,ethnicity,ethnicity_other,height,weight,comorbidities,comorbidities_other,...,pao2,pao2_fio2,ph,high_senstivity_cardiac_troponin,esr,inr,ferritin,d_dimer,crp,hs_crp
0,1,Fever [R50.9],74,Male,"""[]""",,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""","GERD (gastroesophageal reflux disease), Dement...",...,,,,,,1.1,,,223.0,
1,2,Pneumonia [J18.9],61,Female,"""[]""",,154.9,73.6,"""[\""Hypertension\"",\""Other\""]""","Closed fracture of fifth metatarsal bone, Dive...",...,,,,,,1.0,,2354.0,95.6,
2,3,Pneumonia [J18.9],58,Female,"""[]""",,,,"""[\""Hypertension\""]""",,...,,,,,,,,,,
3,4,Suspected COVID-19 virus infection [U07.2],94,Male,"""[]""",,182.8,66.2,"""[\""Hypertension\"",\""Other\""]""","Parkinson's Disease, Back pain, Benign Prostat...",...,,,,,,,,,,
4,5,"Febrile respiratory illness [J98.9, R50.9]",91,Male,"""[]""",,,,"""[\""Chronic cardiac disease (not hypertension)...","TAVI, Atrial fibrillation, Gastric Reflux, Pac...",...,,,,,,,,,,


In [3]:
# Select the 'comorbidities_other' column
comorbidities_data = data['comorbidities_other']

# Check for missing values
comorbidities_data.isnull().sum()

104

In [4]:
# Initialize NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/sejal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Tokenize and create binary bag of words for each patient_id
bag_of_words = {}
for index in range(len(comorbidities_data)):
    # Get the patient ID and convert comorbidities to string
    id_column = data.at[index, 'id']
    comorbidities = str(comorbidities_data[index])

    # Tokenize the comorbidities data
    tokens = word_tokenize(comorbidities.lower())

    # Filter out stop words and non-alphabetical words
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Create a dictionary of the patient ID and its binary BoW
    bag_of_words[id_column] = {token: 1 for token in filtered_tokens}
    
# Sort the patient IDs
sorted_ids = sorted(bag_of_words.keys())

In [6]:
# Create DataFrame from the binary bag of words and export to Excel
output_data = pd.DataFrame.from_dict(bag_of_words, orient='index').fillna(0)
output_data = output_data.reindex(sorted_ids)
output_data.insert(0, 'Patient_ID', output_data.index)

output_data.head()

Unnamed: 0,Patient_ID,gerd,gastroesophageal,reflux,disease,dementia,depression,barrett,esophagus,pneumonia,...,affective,acoustic,neuroma,ear,drum,ercp,sept,antral,ulcers,ogd
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Export DataFrame to Excel
output_data.to_excel('bag_of_words_comorbidities.xlsx', index=False)