In [11]:
#!pip install nltk

In [3]:
import nltk
import os
# One-time setup, safe to re-run: keep the NLTK tokenizer data in ./nltk_data
# under the current working directory.
local_nltk_data = os.path.join(os.getcwd(), 'nltk_data')
# Register the folder with NLTK; the membership check stops re-runs of this
# cell from piling up duplicate entries in nltk.data.path.
if local_nltk_data not in nltk.data.path:
    nltk.data.path.append(local_nltk_data)
# Download punkt_tab only when the local folder does not have it yet, so a
# re-run does not need network access. The lookup is limited to the local
# folder on purpose: the goal is a fresh copy here, not one found elsewhere.
try:
    nltk.data.find('tokenizers/punkt_tab', paths=[local_nltk_data])
except LookupError:
    nltk.download('punkt_tab', download_dir=local_nltk_data)

In [9]:
import pandas as pd
from rapidfuzz import fuzz
from nltk.tokenize import word_tokenize

# Heart failure (HF) reference terms, normalised to lower case so that
# matching against lower-cased diagnosis text is case-insensitive.
HF_cat = [
    term.lower()
    for term in (
        'congestive heart failure',
        'heart failure',
        'diastolic heart failure',
        'high output heart failure',
        'High-output congestive heart failure',
        'Low-output congestive heart failure',
        'X-linked intellectual disability-cardiomegaly-congestive heart failure syndrome',
    )
]

# Sample data: one free-text diagnosis per patient. The matching function
# further down labels each row as heart failure (HF) or not HF.
df = pd.Series(
    [
        'Patient has congestive heart failure',
        'heart failure',
        'High output was noted in heart performance',
        'Nothing unusual found',
        'x linked cardiomegaly disability congestive heart failure syndrome',
        'Low-output heart failure',
    ],
    name='diagnosis',
).to_frame()

# Split every HF term into word tokens: one token list per term, in HF_cat order
HF_tokens = list(map(word_tokenize, HF_cat))

# Label one free-text diagnosis by fuzzy comparison against the HF terms.
def fuzzy_token_match(text, threshold=85):
    """Label a diagnosis string as 'heart failure' or 'not heart failure'.

    The text is lower-cased, tokenized and re-joined with single spaces, then
    compared against every term in the module-level ``HF_tokens`` list
    (each term re-joined the same way).

    Parameters
    ----------
    text : str
        Free-text diagnosis. Anything that is not a string (for example
        ``None`` or the NaN of a missing DataFrame cell) and any blank
        string is labelled 'not heart failure' instead of raising.
    threshold : int or float, default 85
        Minimum similarity score for a term to count as a match.

    Returns
    -------
    str
        'heart failure' as soon as one term scores at least ``threshold``,
        otherwise 'not heart failure'.
    """
    # Missing or blank input cannot name a diagnosis; bail out before
    # calling .lower(), which would raise on None / NaN.
    if not isinstance(text, str) or not text.strip():
        return 'not heart failure'

    joined_text = ' '.join(word_tokenize(text.lower()))
    for hf_term_tokens in HF_tokens:
        joined_hf = ' '.join(hf_term_tokens)
        if len(joined_hf) <= len(joined_text):
            # The term fits inside the text: search for it as a fuzzy substring.
            score = fuzz.partial_ratio(joined_hf, joined_text)
        else:
            # partial_ratio aligns the SHORTER string inside the longer one, so
            # a short text such as 'heart' would score 100 against the term
            # 'heart failure'. When the text is shorter than the term, compare
            # the two strings as a whole instead.
            score = fuzz.ratio(joined_hf, joined_text)
        if score >= threshold:
            return 'heart failure'
    return 'not heart failure'

# Run the matcher over every diagnosis and store the label in a new column,
# then print the labelled frame.
df['HF_status'] = df['diagnosis'].map(fuzzy_token_match)
print(df)


                                           diagnosis          HF_status
0               Patient has congestive heart failure      heart failure
1                                       heart failure      heart failure
2         High output was noted in heart performance  not heart failure
3                              Nothing unusual found  not heart failure
4  x linked cardiomegaly disability congestive he...      heart failure
5                           Low-output heart failure      heart failure
