In [2]:
import joblib
import spacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent, visualize_dep
from spellchecker import SpellChecker
from rapidfuzz import process, fuzz

In [3]:
nlp = medspacy.load()

# Load valid symptoms
symptom_columns = joblib.load('../models/symptom_columns.pkl')
valid_symptoms = [symptom for symptom in symptom_columns]

input_text = (
    "I have no headachees but I do have a fevver and depression. "
)

In [4]:
# Adding target rules
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = []
for symptom in valid_symptoms:
    target_rules.append(TargetRule(literal=symptom, category="SYMPTOM"))
target_matcher.add(target_rules)


In [5]:
# Preprocess the input text for typos
def correct_typos(input_text):
    corrected_tokens = []
    spell = SpellChecker()
    tokens = input_text.split()
    for token in tokens:
        if token.lower() not in spell:
            correction = spell.correction(token)
            corrected_tokens.append(correction if correction else token)
        else:
            corrected_tokens.append(token)
    return " ".join(corrected_tokens)

In [6]:
# After correcting typos, map symptoms to valid symptoms
def correct_symptom_candidate(candidate: str, valid_symptoms: list, threshold: int = 80) -> str:
    """
    Given a candidate word from the input text (e.g. "anxiety"), fuzzy-match it
    against the valid_symptoms list (which may contain multi-word phrases like 
    "anxiety and nervousness") and return the best matching valid symptom if the 
    fuzzy score is above the threshold. Otherwise, return the candidate as-is.
    """
    candidate = candidate.lower()
    best_match, score, _ = process.extractOne(candidate, valid_symptoms, scorer=fuzz.token_set_ratio)
    if score >= threshold:
        return best_match.lower()
    return candidate

def fuzzy_match_text(text: str, valid_symptoms: list, threshold: int = 80) -> str:
    tokens = text.split()
    corrected_tokens = [correct_symptom_candidate(token, valid_symptoms, threshold) for token in tokens]
    deduped_tokens = []
    for token in corrected_tokens:
        if not deduped_tokens or token != deduped_tokens[-1]:
            deduped_tokens.append(token)
    return " ".join(deduped_tokens)


In [7]:
# Use medspacy to locate valid symptoms
# Apply contextual anlysis to filter out negated symptoms
corrected_text = correct_typos(input_text)
fuzzy_text = fuzzy_match_text(corrected_text, valid_symptoms)
print(fuzzy_text)

doc = nlp(fuzzy_text)

i have no headache but i do have a fever anxiety and nervousness depression


In [8]:
final_symptoms = set()
for ent in doc.ents:
    if ent.label_ == "SYMPTOM" and not ent._.is_negated:
        final_symptoms.add(ent.text.lower())
print("Final Extracted Valid Symptoms:", list(final_symptoms))

Final Extracted Valid Symptoms: ['fever', 'depression', 'anxiety and nervousness']


In [9]:
# Visualize the entities and dependencies
visualize_dep(doc)