# medical NER
## BlueBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the BlueBERT checkpoint together with its tokenizer.
# NOTE(review): this looks like a pretrained base checkpoint rather than one
# fine-tuned for NER; if so, the token-classification head is newly
# initialised and the labels printed below carry no meaning — please confirm.
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sentence to tag.
text = "Patient with severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is showing signs of improvement."

# Tokenize the sentence into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Pick the highest-scoring class id for every token position.
logits = outputs.logits
predicted_token_classes = torch.argmax(logits, dim=-1).squeeze().tolist()

# Map input ids back to token strings and class ids to label names.
input_id_list = inputs["input_ids"].squeeze().tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
id2label = model.config.id2label
predictions = [id2label[class_id] for class_id in predicted_token_classes]

# Print each token next to its predicted label.
for token, prediction in zip(tokens, predictions):
    print(f"{token}: {prediction}")

## PubMedBERT

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Load the PubMedBERT checkpoint together with its tokenizer.
# NOTE(review): this looks like a pretrained base checkpoint rather than one
# fine-tuned for NER; if so, the token-classification head is newly
# initialised and the labels printed below carry no meaning — please confirm.
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Candidate inputs. Previously `text` was assigned three times in a row and
# only the last assignment took effect; the alternatives are kept here by
# name so the active one is explicit and switching is a one-word change.
sample_texts = {
    "vaccine_targets": "Recent studies on SARS-CoV-2 suggest potential vaccine targets.",
    "sars_cov_2_patient": "Patient with severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is showing signs of improvement.",
    "case_report": 'CASE: A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea. Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings.',
}
text = sample_texts["case_report"]

# Tokenize the text into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class id per token; the batch dimension is kept.
predicted_token_classes = logits.argmax(-1)
print('\n\n', predicted_token_classes)

# Token strings of the encoded input, and the label name for each of them.
# Converting row 0 to a list once avoids indexing the tensor element by element.
tokens = inputs.tokens()
predictions = [model.config.id2label[class_id] for class_id in predicted_token_classes[0].tolist()]

# Print each token next to its predicted label.
for token, prediction in zip(tokens, predictions):
    print(f"{token}: {prediction}")


## RoBERTa-large

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Checkpoint id, named once so tokenizer and model cannot drift apart.
checkpoint = "abymmathew/RoBERTa-large-PM-M3-Voc-hf-finetuned-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Candidate inputs. Previously `text` was assigned three times in a row and
# only the last assignment took effect; the alternatives are kept here by
# name so the active one is explicit and switching is a one-word change.
sample_texts = {
    "case_report": 'CASE: A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea. Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings.',
    "vaccine_targets": "Recent studies on SARS-CoV-2 suggest potential vaccine targets.",
    "sars_cov_2_patient": "Patient with severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is showing signs of improvement.",
}
text = sample_texts["sars_cov_2_patient"]

# Tokenize the text into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking disabled; the logits shape is printed
# as a quick sanity check.
with torch.no_grad():
    logits = model(**inputs).logits
    print(logits.shape)

# Highest-scoring class id per token, mapped to label names.
# Converting row 0 to a list once avoids indexing the tensor element by element.
predicted_token_classes = logits.argmax(-1)
tokens = inputs.tokens()
predictions = [model.config.id2label[class_id] for class_id in predicted_token_classes[0].tolist()]
for token, prediction in zip(tokens, predictions):
    print(f"{token}: {prediction}")

# Show the full id -> label mapping of this checkpoint.
print(model.config.id2label)

# RadGraph MIMIC-CXR

In [4]:
import json
import os
import pprint

# Location of the RadGraph training split. The original absolute path stays
# the default; set RADGRAPH_TRAIN_PATH to run the notebook on another machine
# without editing this cell.
graph_train_path = os.environ.get(
    'RADGRAPH_TRAIN_PATH',
    '/DATA1/llm-research/RadGraph/physionet.org/files/radgraph/1.0.0/train.json',
)

# Read with an explicit encoding so decoding does not depend on the locale.
with open(graph_train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Show the entity annotations of one report.
pprint.pprint(train_data['p18/p18004941/s58821758.txt']['entities'])

{'1': {'end_ix': 36,
       'label': 'ANAT-DP',
       'relations': [],
       'start_ix': 36,
       'tokens': 'Lungs'},
 '2': {'end_ix': 38,
       'label': 'OBS-DP',
       'relations': [['located_at', '1']],
       'start_ix': 38,
       'tokens': 'clear'},
 '3': {'end_ix': 40,
       'label': 'OBS-DP',
       'relations': [['located_at', '4'],
                     ['located_at', '5'],
                     ['located_at', '7']],
       'start_ix': 40,
       'tokens': 'Normal'},
 '4': {'end_ix': 41,
       'label': 'ANAT-DP',
       'relations': [],
       'start_ix': 41,
       'tokens': 'cardiomediastinal'},
 '5': {'end_ix': 43,
       'label': 'ANAT-DP',
       'relations': [],
       'start_ix': 43,
       'tokens': 'hilar'},
 '6': {'end_ix': 44,
       'label': 'ANAT-DP',
       'relations': [['modify', '4'], ['modify', '5']],
       'start_ix': 44,
       'tokens': 'silhouettes'},
 '7': {'end_ix': 46,
       'label': 'ANAT-DP',
       'relations': [],
       'start_ix': 46,
  

Dataset of MIMIC
>       '15': {'end_ix': 69,
              'label': 'ANAT-DP',
              'relations': [],
              'start_ix': 69,
              'tokens': 'heart'},
       '16': {'end_ix': 70,
              'label': 'ANAT-DP',
              'relations': [['modify',
                            '15']],
              'start_ix': 70,
              'tokens': 'border'},

The training dataset does not annotate entities made up of discontinuous tokens, so the model cannot predict such entities.
> German football team won the world and European championships in 1974 and 1972 respectively.

Should be:  
'World championship' and 'European championship'


In [None]:
# List the top-level fields stored for this report.
train_data['p18/p18004941/s58821758.txt'].keys()

In [None]:
# Show the raw text of the same report.
train_data['p18/p18004941/s58821758.txt']['text']

In [None]:
# Token at index 47 of the space-split report text.
# Presumably a cross-check against an entity's start_ix/end_ix — TODO confirm.
train_data['p18/p18004941/s58821758.txt']['text'].split(' ')[47]

predict_examples = [[g, label, word, e], [g, label, word, e], ...]

test_dataset = NerDataset(predict_examples)

sampler = SequentialSampler(test_dataset)

data_loader = DataLoader(
    test_dataset,
    sampler=sampler,
    batch_size=32, # you can adjust evaluation batch size, we prefer using 32
    collate_fn=default_data_collator,
    drop_last=False,
)


In [None]:
# One sample record: 'location' holds a token index into the space-split
# 'text' (index 63 is 'TAC') and 'label' holds the matching long form.
# NOTE(review): 'tgfÎ²' / 'Î±sma' look like UTF-8 'β' / 'α' decoded with the
# wrong codec — confirm whether the upstream data is already mis-encoded.
a = {'abstract_id': 14145090,
 'text': 'velvet antlers vas are commonly used in traditional chinese medicine and invigorant and contain many PET components for health promotion the velvet antler peptide svap is one of active components in vas based on structural study the svap interacts with tgfÎ² receptors and disrupts the tgfÎ² pathway we hypothesized that svap prevents cardiac fibrosis from pressure overload by blocking tgfÎ² signaling SDRs underwent TAC tac or a sham operation T3 one month rats received either svap mgkgday or vehicle for an additional one month tac surgery induced significant cardiac dysfunction FB activation and fibrosis these effects were improved by treatment with svap in the heart tissue tac remarkably increased the expression of tgfÎ² and connective tissue growth factor ctgf ROS species C2 and the phosphorylation C2 of smad and ERK kinases erk svap inhibited the increases in reactive oxygen species C2 ctgf expression and the phosphorylation of smad and erk but not tgfÎ² expression in cultured cardiac fibroblasts angiotensin ii ang ii had similar effects compared to tac surgery such as increases in Î±smapositive CFs and collagen synthesis svap eliminated these effects by disrupting tgfÎ² IB to its receptors and blocking ang iitgfÎ² downstream signaling these results demonstrated that svap has antifibrotic effects by blocking the tgfÎ² pathway in CFs',
 'location': [63],
 'label': ['transverse aortic constriction']}

In [None]:
# Token at the index listed in a['location']; for this sample it is 'TAC'.
a['text'].split(' ')[63]

# Pure

In [None]:
import json
import pprint

# One document in the JSON input format shown in this section, with the
# top-level keys "clusters", "sentences", "ner", "relations" and "doc_key".
# The variable used to be called `input`, which shadowed the builtin of the
# same name. The literal is triple-quoted because it contains a line break,
# which is not allowed inside a single-quoted string; json.loads treats that
# line break as insignificant whitespace.
pure_example_json = '''{"clusters": [[[6, 17], [32, 32]], [[4, 4], [55, 55], [91, 91]], [[58, 62], [64, 64], [79, 79]]], "sentences": [["This", "paper", "presents", "an", "algorithm", "for", "computing", "optical", "flow", ",", "shape", ",", "motion", ",", "lighting", ",", "and", "albedo", "from", "an", "image", "sequence", "of", "a", "rigidly-moving", "Lambertian", "object", "under", "distant", "illumination", "."], ["The", "problem", "is", "formulated", "in", "a", "manner", "that", "subsumes", "structure", "from", "motion", ",", "multi-view", "stereo", ",", "and", "photo-metric", "stereo", "as", "special", "cases", "."], ["The", "algorithm", "utilizes", "both", "spatial", "and", "temporal", "intensity", "variation", "as", "cues", ":", "the", "former", "constrains", "flow", "and", "the", "latter", "constrains", "surface", "orientation", ";", "combining", "both", "cues", "enables", "dense", "reconstruction", "of", "both", "textured", "and", "texture-less", "surfaces", "."], ["The", "algorithm", "works", "by", "iteratively", "estimating", "affine", "camera", "parameters", ",", "illumination", ",", "shape", ",", "and", "albedo", "in", "an", "alternating", "fashion", "."], ["Results", "are", "demonstrated", "on", "videos", "of", "hand-held", "objects", "moving", "in", "front", "of", "a", "fixed", "light", "and", "camera", "."]], "ner": [[[4, 4, "Generic"], [6, 17, "Task"], [20, 21, "Material"], [24, 26, "Material"], [28, 29, "OtherScientificTerm"]], [[32, 32, "Generic"], [42, 42, "Material"], [44, 45, "Material"], [48, 49, "Material"]], [[55, 55, "Generic"], [58, 62, "OtherScientificTerm"], [64, 64, "Generic"], [67, 67, "Generic"], [69, 69, "OtherScientificTerm"], [72, 72, "Generic"], [74, 75, "OtherScientificTerm"], [79, 79, "Generic"], [81, 88, "Task"]], [[91, 91, "Generic"], [95, 105, "Method"]], [[115, 118, "Material"]]], "relations": [[[4, 4, 6, 17, "USED-FOR"], [20, 21, 4, 4, "USED-FOR"], [24, 26, 20, 21, "FEATURE-OF"], [28, 29, 24, 26, "FEATURE-OF"]], [[42, 42, 44, 45, 
"CONJUNCTION"], [44, 45, 48, 49, "CONJUNCTION"]], [[58, 62, 55, 55, "USED-FOR"], [67, 67, 64, 64, "HYPONYM-OF"], [67, 67, 69, 69, "USED-FOR"], [67, 67, 72, 72, "CONJUNCTION"], [72, 72, 64, 64, "HYPONYM-OF"], [72, 72, 74, 75, "USED-FOR"], [79, 79, 81, 88, "USED-FOR"]], [[95, 105, 91, 91, "USED-FOR"]], []], "doc_key": "ICCV_2003_158_abs"}'''

# Parse and pretty-print the document to inspect its structure.
pprint.pprint(json.loads(pure_example_json))

In [None]:
# Load the SciBERT encoder (allenai/scibert_scivocab_uncased) directly.
from transformers import AutoModel

# `cache_dir` was referenced here without being defined anywhere in the
# notebook, so the cell raised NameError on a fresh kernel. None selects the
# library's default cache location; set a directory path here to override it.
cache_dir = None
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir=cache_dir)

In [None]:
from flair.data import Sentence
from flair.nn import Classifier

# Load the biomedical NER tagger.
tagger = Classifier.load("hunflair")

# Wrap the example title in a flair Sentence.
sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# Run the tagger on the sentence.
tagger.predict(sentence)

# new

In [9]:
import torch

# Report the PyTorch build, the CUDA and cuDNN versions it was built against,
# and whether a CUDA device is usable.
# NOTE(review): the recorded output pairs a cu102 build with an H100; that GPU
# generation presumably needs a newer CUDA build of PyTorch — confirm that
# kernels actually launch on this setup.
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
cuda_available = torch.cuda.is_available()
print(cuda_available)
# Only query the device name when a device exists; the unconditional call
# raised on a machine with no CUDA device.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")


1.12.1+cu102
10.2
7605
True
NVIDIA H100 PCIe


In [5]:
import torch

# Print the installed PyTorch version string.
print(torch.__version__)

1.12.1+cu102
