In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import joblib



In [2]:
from transformers import BertForTokenClassification, BertTokenizerFast

# Load the tokenizer and the token-classification model from their saved
# locations (presumably local directories produced by an earlier training run
# -- confirm the paths exist relative to the notebook's working directory).
tokenizer = BertTokenizerFast.from_pretrained("without_anomalies_PUBMEDBERT_tokenizer")
model = BertForTokenClassification.from_pretrained("without_anomalies_PUBMEDBERT_model")

In [3]:
# Restore the label encoder used to turn predicted class indices back into tag
# strings.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load encoder files from a trusted source.
label_encoder_path = "version2_label_encoder.joblib"
label_encoder = joblib.load(filename=label_encoder_path)

In [4]:
# Sample patient query. A later cell reassigns `text`, so on a top-to-bottom
# run this value is replaced before the tokenizer cell reads it.
text = "patient: my mom was diagnosed with bacterial pneumonia back in late december. after a round of antibiotics, she felt much better but the xray still showed pneumonia. a round of stronger antibiotics were then given to her. she went back for a follow up xray last week and the xray now shows the pneumonia is in both lungs. the strange thing is...she has no symptoms. no coughing, no fever, nothing. could it possibly be viral? or fluid? or cancer???? btw she has never smoked in her life."

In [12]:
# Sample patient query. A later cell reassigns `text`, so on a top-to-bottom
# run this value is replaced before the tokenizer cell reads it.
text = "patient: hi, may i answer your health queries right now ? please type your query here...i am 42 year old female with known case of pneumonia and constant coughing and low body temp of 35.3c feeling anemic advise please ? currently using bioxin and guafinicine/codein"

In [20]:
# Sample patient query. A later cell reassigns `text`, so on a top-to-bottom
# run this value is replaced before the tokenizer cell reads it.
text = "patient: dear doctor my littel son 11 months when he was 2 months and 15 days he got chest infection pneumonia in the right lung and after 2 weeks of treatment in the hospital with antibiotic he discharged next day he was admitted again with new infected by pneumonia in the right lung and then doctor did some test a barium swallow and ph. the resulte ge reflux grade 4. he was with ngt and ar formula by ngt for 3 moths with zantac and motinorm also with sitting position ,but all of those dosn t change any thing.after that when he was 5 months and 15 days he had an a laparoscopic fundoplication 5 ports, 360m degree wrap over 10fr ryles tube. he trying to vomit but he can t.i observed every time when he trying to vomit like for 5 to 7 minutes and when he sneeze every time after trying vomit then he got back to normal after sneezing.. and he drinking now neocate formula based on free amino acid. also he is allergy with lots of food ,."

In [28]:
# Sample patient query. This is the last assignment to `text` before the
# tokenizer cell, so it is the input actually used on a top-to-bottom run.
text = "patient: i have tonsillitis long time ago and every year i feel allergic problem in my thort. about 7 days i feel same allergic problem in my thort and tongue i have 1 canker sore. i take just anti histamine. but ago i treat this with prednisone. does it covid 19?"

In [29]:
# Tokenize the current `text`. Anything beyond 256 tokens is silently dropped
# by `truncation=True`, so very long queries are only partially labelled.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)

# Inference only: skip gradient tracking during the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Highest-scoring class index at every token position.
predictions = torch.argmax(logits, dim=-1)

# A single text was tokenized, so row 0 holds the whole prediction sequence.
# `.cpu()` keeps the NumPy conversion valid even if the tensors live on a GPU.
predicted_label_indices = predictions[0].cpu().numpy()

# Decode every index in one vectorized call instead of invoking
# `inverse_transform` once per token; `.tolist()` keeps `predicted_labels`
# a plain Python list of strings, one per token.
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

In [30]:
# Recover the token strings for the single encoded sequence.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())

# Stitch "##"-prefixed continuation pieces back onto the word they belong to.
# Each rebuilt word keeps the label predicted for its first piece; labels
# predicted for continuation pieces are ignored.
word_labels = []
current_word = ""
current_label = None

for piece, piece_label in zip(tokens, predicted_labels):
    if piece.startswith("##"):
        # Continuation piece: extend the word in progress, drop the "##" marker.
        current_word += piece[2:]
        continue
    # A new word begins: emit the finished one (if any) before starting over.
    if current_word:
        word_labels.append((current_word, current_label))
    current_word, current_label = piece, piece_label

# Emit the word that was still being assembled when the loop ended.
if current_word:
    word_labels.append((current_word, current_label))

In [31]:
# Show the merged (word, label) pairs; special tokens such as [CLS] are still
# present at this point.
word_labels

[('[CLS]', 'O'),
 ('patient', 'O'),
 (':', 'O'),
 ('i', 'O'),
 ('have', 'O'),
 ('tonsillitis', 'B-Disease_disorder'),
 ('long', 'O'),
 ('time', 'O'),
 ('ago', 'O'),
 ('and', 'O'),
 ('every', 'B-Frequency'),
 ('year', 'I-Duration'),
 ('i', 'O'),
 ('feel', 'O'),
 ('allergic', 'B-Sign_symptom'),
 ('problem', 'I-Sign_symptom'),
 ('in', 'O'),
 ('my', 'O'),
 ('thort', 'B-Activity'),
 ('.', 'O'),
 ('about', 'B-Duration'),
 ('7', 'I-Duration'),
 ('days', 'I-Date'),
 ('i', 'O'),
 ('feel', 'O'),
 ('same', 'O'),
 ('allergic', 'B-Sign_symptom'),
 ('problem', 'I-Sign_symptom'),
 ('in', 'O'),
 ('my', 'O'),
 ('thort', 'B-Activity'),
 ('and', 'O'),
 ('tongue', 'O'),
 ('i', 'O'),
 ('have', 'O'),
 ('1', 'B-Lab_value'),
 ('canker', 'B-Sign_symptom'),
 ('sore', 'B-Sign_symptom'),
 ('.', 'O'),
 ('i', 'O'),
 ('take', 'O'),
 ('just', 'O'),
 ('anti', 'B-Medication'),
 ('histamine', 'I-Medication'),
 ('.', 'O'),
 ('but', 'O'),
 ('ago', 'O'),
 ('i', 'O'),
 ('treat', 'O'),
 ('this', 'O'),
 ('with', 'O'),
 ('pred

In [32]:
# Tabulate the (word, label) pairs: one row per merged word, in sequence order.
word_labels_df = pd.DataFrame(
    {
        'Word': [word for word, _ in word_labels],
        'Label': [tag for _, tag in word_labels],
    }
)
word_labels_df

Unnamed: 0,Word,Label
0,[CLS],O
1,patient,O
2,:,O
3,i,O
4,have,O
5,tonsillitis,B-Disease_disorder
6,long,O
7,time,O
8,ago,O
9,and,O


In [33]:
# Drop the rows whose word is one of the special tokens [PAD], [CLS] or [SEP],
# using a single membership mask.
is_special_token = word_labels_df['Word'].isin(['[PAD]', '[CLS]', '[SEP]'])
filtered_df = word_labels_df[~is_special_token]
filtered_df

Unnamed: 0,Word,Label
1,patient,O
2,:,O
3,i,O
4,have,O
5,tonsillitis,B-Disease_disorder
6,long,O
7,time,O
8,ago,O
9,and,O
10,every,B-Frequency


In [34]:
# Strip the leading "B-"/"I-" prefix so each row carries only the entity type.
# `.assign` returns a new frame: the previous version aliased `filtered_df`
# (a filtered slice) and assigned a column on it, which raises
# SettingWithCopyWarning and also rewrote `filtered_df` through the alias.
test_preds_df = filtered_df.assign(
    Label=filtered_df['Label'].str.replace(r'^(B-|I-)', '', regex=True)
)
test_preds_df

Unnamed: 0,Word,Label
1,patient,O
2,:,O
3,i,O
4,have,O
5,tonsillitis,Disease_disorder
6,long,O
7,time,O
8,ago,O
9,and,O
10,every,Frequency


In [35]:
# Write the word-level predictions to disk; the DataFrame index is not
# written as a column.
test_preds_df.to_csv(path_or_buf='NER4.csv', index=False)