# Perform negation detection on clinical text to accurately identify present medical conditions

In [None]:
import pandas as pd

# Load the input table; later cells iterate over its 'sentence' column.
df = pd.read_csv(filepath_or_buffer='clinical_sentences.csv')

In [None]:
# Condition name -> surface forms that count as a mention of that condition.
# The keys double as the output column names and their insertion order is the
# column order, so neither may be changed casually. Aliases are written in
# their conventional casing (e.g. "TB", "COPD"); any case normalisation is the
# matcher's responsibility.
keywords = {
    "pneumonia": [
        "pneumonia",
        "lung infection",
    ],
    "diabetes": [
        "diabetes",
        "hyperglycemia",
        "diabetic",
    ],
    "hypertension": [
        "hypertension",
        "high blood pressure",
    ],
    "tuberculosis": [
        "tuberculosis",
        "TB",
    ],
    "myocardial_infarction": [
        "myocardial infarction",
        "heart attack",
    ],
    "asthma": [
        "asthma",
        "wheeze",
    ],
    "hepatic_steatosis": [
        "hepatic steatosis",
        "fatty liver",
    ],
    "eczema": [
        "eczema",
        "dermatitis",
    ],
    "osteoporosis": [
        "osteoporosis",
        "bone loss",
    ],
    "migraine": [
        "migraine",
        "headache",
    ],
    "COPD": [
        "COPD",
        "chronic obstructive pulmonary disease",
    ],
}

In [None]:
import re

# Regex source for negation cues, matched as whole words/phrases. Alternatives
# are tried left to right, so the order below is part of the pattern.
neg_terms = r"\b(" + "|".join((
    "no",
    "not",
    "absence of",
    "negative for",
    "rules out",
    "no evidence of",
    "free of",
)) + r")\b"

In [None]:
def detect(text, cond, *, keyword_map=None, negation_pattern=None, window=40):
  """Classify one condition in one sentence as "Present", "Absent" or None.

  Every alias of ``cond`` is searched for in the lower-cased text. A mention
  counts as negated when a negation cue ends strictly before the mention and
  at most ``window`` characters before it.

  Args:
    text: The sentence to inspect. Non-string values (e.g. the float NaN that
      pandas uses for missing cells) are treated as "no mention".
    cond: Key into the keyword mapping selecting the condition to look for.
    keyword_map: Optional mapping of condition name -> list of aliases.
      Defaults to the module-level ``keywords``.
    negation_pattern: Optional regex for negation cues, applied to the
      lower-cased text. Defaults to the module-level ``neg_terms``.
    window: Maximum distance, in characters, between the end of a negation
      cue and the start of a mention for the cue to apply.

  Returns:
    "Present" if at least one mention has no negation cue in its window,
    "Absent" if every mention is negated, or None if there is no mention.

  Raises:
    KeyError: If ``cond`` is not a key of the keyword mapping.
  """
  if not isinstance(text, str):
    return None

  aliases = (keywords if keyword_map is None else keyword_map)[cond]
  cue_pattern = neg_terms if negation_pattern is None else negation_pattern

  lowered = text.lower()
  # The cue positions do not depend on the alias, so scan for them only once.
  cue_ends = [cue.end() for cue in re.finditer(cue_pattern, lowered)]

  found = False
  for alias in aliases:
    # The text is lower-cased, so the alias must be too; otherwise "TB" and
    # "COPD" can never match. Whole-word matching (with an optional plural
    # "s") keeps short aliases such as "tb" from firing inside "football".
    mention = r"(?<!\w)" + re.escape(alias.lower()) + r"s?(?!\w)"
    for m in re.finditer(mention, lowered):
      found = True
      start = m.start()
      if not any(0 < start - end <= window for end in cue_ends):
        return "Present"

  return "Absent" if found else None

In [None]:
# Label every sentence against every condition: one dict per sentence holding
# the raw text under 'Sentence' followed by one detect() verdict per entry of
# keywords, in keywords order.
out = [
  {'Sentence': sentence, **{name: detect(sentence, name) for name in keywords}}
  for sentence in df['sentence']
]

In [None]:
# Show the first three labelled rows as a table.
pd.DataFrame(data=out).head(n=3)

Unnamed: 0,Sentence,pneumonia,diabetes,hypertension,tuberculosis,myocardial_infarction,asthma,hepatic_steatosis,eczema,osteoporosis,migraine,COPD
0,No evidence of pneumonia was observed in the c...,Absent,,,,,,,,,,
1,The patient presents with symptoms of diabetes...,,Present,,,,,,,,,
2,"Chest imaging shows clear lungs, free of infil...",,,,,,,,,,,
