## TF-IDF Feature Extraction

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [6]:
# Read the inpatient workbook into a DataFrame and preview the first rows
DATA_FILE = 'Canada_Hosp1_COVID_InpatientData.xlsx'
df = pd.read_excel(DATA_FILE)
df.head()

Unnamed: 0,id,reason_for_admission,age,sex,ethnicity,ethnicity_other,height,weight,comorbidities,comorbidities_other,...,pao2,pao2_fio2,ph,high_senstivity_cardiac_troponin,esr,inr,ferritin,d_dimer,crp,hs_crp
0,1,Fever [R50.9],74,Male,"""[]""",,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""","GERD (gastroesophageal reflux disease), Dement...",...,,,,,,1.1,,,223.0,
1,2,Pneumonia [J18.9],61,Female,"""[]""",,154.9,73.6,"""[\""Hypertension\"",\""Other\""]""","Closed fracture of fifth metatarsal bone, Dive...",...,,,,,,1.0,,2354.0,95.6,
2,3,Pneumonia [J18.9],58,Female,"""[]""",,,,"""[\""Hypertension\""]""",,...,,,,,,,,,,
3,4,Suspected COVID-19 virus infection [U07.2],94,Male,"""[]""",,182.8,66.2,"""[\""Hypertension\"",\""Other\""]""","Parkinson's Disease, Back pain, Benign Prostat...",...,,,,,,,,,,
4,5,"Febrile respiratory illness [J98.9, R50.9]",91,Male,"""[]""",,,,"""[\""Chronic cardiac disease (not hypertension)...","TAVI, Atrial fibrillation, Gastric Reflux, Pac...",...,,,,,,,,,,


In [7]:
# Pull the free-text "comorbidities_other" column and replace missing entries
# with empty strings. fillna() returns a new Series, so `df` is left untouched;
# an in-place fillna on the selected column would modify `df` or not depending
# on the pandas version / Copy-on-Write setting.
comorbidities = df['comorbidities_other'].fillna("")

# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' backs the stop-word set built below;
#   - 'punkt' / 'punkt_tab' back word_tokenize(), which clean_text() calls
#     (which of the two is required depends on the installed NLTK version).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words as a set for fast membership tests
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/sejal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Data cleaning function
def clean_text(text):
    """Normalise one free-text entry into a space-separated string of tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and tokenised with NLTK's ``word_tokenize``, and
    tokens present in the module-level ``stop_words`` set are dropped.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (None / NaN / NA) are treated as empty
        text; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    # re.sub() only accepts strings; guard against missing or numeric cells
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)
    # Remove special characters and digits, then convert to lowercase
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # Tokenize the text
    tokens = word_tokenize(letters_only)
    # Remove stop words and join the tokens back into a single string
    return ' '.join(token for token in tokens if token not in stop_words)

In [9]:
# Run every comorbidity entry through clean_text, one element at a time
cleaned_comorbidities = comorbidities.map(clean_text)

In [10]:
# TF-IDF settings: built-in English stop-word list, plus document-frequency
# cut-offs passed as min_df=2 and max_df=0.9
tfidf_options = dict(stop_words='english', min_df=2, max_df=0.9)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the cleaned text
vectorizer.fit(cleaned_comorbidities)

In [11]:
# Vectorize every cleaned entry in a single transform() call (one dense row
# per entry) instead of invoking the vectorizer once per row.
tfidf_matrix = vectorizer.transform(cleaned_comorbidities).toarray()

# Map each patient id to its TF-IDF row. Ids and rows are paired by position,
# so this does not depend on `df` having a default 0..n-1 index.
# As with a plain dict assignment loop, a repeated id keeps its last row.
features_dict = dict(zip(df['id'], tfidf_matrix))

In [12]:
# Build a DataFrame from the features dictionary: the dict keys (patient ids)
# become the index and each vocabulary term becomes a column
features_df = pd.DataFrame.from_dict(
    features_dict,
    orient='index',
    columns=vectorizer.get_feature_names_out(),
)

# Label the index as 'id'. The ids live in the index rather than in a column,
# and this frame has no 'Unnamed: 0' column, so renaming that column would
# have no effect.
features_df = features_df.rename_axis('id')

In [13]:
# Mean TF-IDF score of each term across all patients
feature_means = features_df.mean()

# Rank the terms from highest to lowest mean score
sorted_features = feature_means.sort_values(ascending=False)

# Show the unsorted means first, then the ranking
print(feature_means)
print(sorted_features)

aaa          0.001188
abdominal    0.003625
abnormal     0.002128
absence      0.000822
abuse        0.003531
               ...   
vessel       0.002309
visual       0.001525
vitamin      0.004516
wall         0.002469
years        0.001905
Length: 436, dtype: float64
cholesterol       0.068144
dyslipidemia      0.067095
high              0.065549
disease           0.043965
hyperlipidemia    0.033750
                    ...   
nonalcoholic      0.000816
delirium          0.000776
cataracts         0.000707
reduced           0.000639
echocardiogram    0.000528
Length: 436, dtype: float64


In [14]:
# Keep the 50 terms with the highest mean TF-IDF score
top_50_features = sorted_features.head(50)

In [15]:
# Restrict the feature matrix to the columns of the 50 top-ranked terms
top_50_features_df = features_df.loc[:, top_50_features.index]

# Write the selection to a CSV file, labelling the index column 'id'
# NOTE(review): the file name says "top_150" although only 50 features are
# exported — confirm the intended name before renaming, since other code may
# read this path.
top_50_features_df.to_csv('top_150_comorbidities_tfidf_features.csv', index_label='id')