In [5]:
text = "Psilocybin is a psychedelic compound found in magic mushrooms, often in the Psilocybe genus. It's used in spiritual rituals, recreationally, and as medicine. Psilocybin has hallucinogenic effects. It can be obtained from both fresh and dried mushrooms in varying concentrations. It can also be created in a lab. There's increased interest in using pure psilocybin for addictions, depression, and other mental and psychological disorders due to its potential to stimulate certain areas of the brain. People use psilocybin for alcohol use disorder and other addictions, anxiety, depression, migraines, PTSD, and many other conditions, but there is no good scientific evidence to support these uses. Psilocybin is illegal under federal law in the US. It is classified as a Schedule I controlled substance."

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
tagger = SequenceTagger.load('ner-ontonotes')

sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
tagger.predict(sentences)

# write ner results to file
with open("../output/ner_results.txt", "w") as f:
    for sent in sentences:
        for entity in sent.get_spans('ner'):
            f.write(f"{entity}\n")
        

2024-05-07 17:49:11,791 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC


In [7]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
text = "Psilocybin is a psychedelic compound found in magic mushrooms, often in the Psilocybe genus. It's used in spiritual rituals, recreationally, and as medicine. Psilocybin has hallucinogenic effects. It can be obtained from both fresh and dried mushrooms in varying concentrations. It can also be created in a lab. There's increased interest in using pure psilocybin for addictions, depression, and other mental and psychological disorders due to its potential to stimulate certain areas of the brain. People use psilocybin for alcohol use disorder and other addictions, anxiety, depression, migraines, PTSD, and many other conditions, but there is no good scientific evidence to support these uses. Psilocybin is illegal under federal law in the US. It is classified as a Schedule I controlled substance."

results = pipe(text)
with open("../output/ner_results.txt", "w") as f:
    for entity in results:
        f.write(f"{entity}\n")


In [9]:
text = "Coltsfoot is a plant. It is native to Europe and parts of Asia. It has been introduced to North America. The leaf, flower, and root are used to make medicine. Despite serious safety concerns, coltsfoot is used for asthma, cough, sore throat, swelling of the airways, and other conditions, but there is no good scientific evidence to support these uses."

from sklearn.feature_extraction.text import TfidfVectorizer

# Define a list of symptoms
symptoms_list = ["asthma", "cough", "sore throat", "swelling"]

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(vocabulary=symptoms_list)

# Fit the vectorizer and transform the text
tfidf_matrix = vectorizer.fit_transform([text])

# Get feature names (symptoms) with non-zero TF-IDF values
feature_names = vectorizer.get_feature_names_out()

# Extract symptoms with non-zero TF-IDF values
symptoms = [feature_names[idx] for idx in tfidf_matrix.nonzero()[1]]

print("Symptoms:", symptoms)


Symptoms: ['swelling', 'cough', 'asthma']


In [11]:
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent

# Load medspacy model
nlp = medspacy.load()
print(nlp.pipe_names)

text = """
Past Medical History:
1. Atrial fibrillation
2. Type II Diabetes Mellitus

Assessment and Plan:
There is no evidence of pneumonia. Continue warfarin for Afib. Follow up for management of type 2 DM.
"""

# Add rules for target concept extraction
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    TargetRule("atrial fibrillation", "PROBLEM"),
    TargetRule("atrial fibrillation", "PROBLEM", pattern=[{"LOWER": "afib"}]),
    TargetRule("pneumonia", "PROBLEM"),
    TargetRule("Type II Diabetes Mellitus", "PROBLEM", 
              pattern=[
                  {"LOWER": "type"},
                  {"LOWER": {"IN": ["2", "ii", "two"]}},
                  {"LOWER": {"IN": ["dm", "diabetes"]}},
                  {"LOWER": "mellitus", "OP": "?"}
              ]),
    TargetRule("warfarin", "MEDICATION")
]
target_matcher.add(target_rules)

doc = nlp(text)
visualize_ent(doc)


['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']
