In [14]:

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the pretrained medical NER checkpoint. `tokenizer` and `model` are
# module-level names used by tokenize_and_predict() below.
tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")
model = AutoModelForTokenClassification.from_pretrained("Clinical-AI-Apollo/Medical-NER")

# Load the comments workbook.
# NOTE(review): absolute, machine-specific path; the same path is written back
# to at the end of this cell.
file_path = r"C:\Users\Michael\Downloads\NER\comments.xlsx"
df = pd.read_excel(file_path)
# Work on the first 1000 rows only. The explicit copy makes `df` an independent
# frame, so adding result columns later does not write into a slice of the
# loaded frame (which raises SettingWithCopyWarning).
df = df.iloc[:1000].copy()

# Input column holding the free-text comments, and names of the output columns.
patient_comments_col = 'careprovidercomments'
unique_entities_results = 'Unique Entities'
total_words_results = 'Total Words'
specificity_results = 'Specificity Score'

# Per-row results, filled in row order by the scoring loop.
specificity_scores = []
unique_entities_list = []
total_words_list = []

def tokenize_and_predict(text):
    """Label each word of ``text`` with the NER model's predicted tag.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    to at most 1024 tokens. Special tokens and non-initial sub-tokens are
    dropped, so each word contributes exactly one pair, carrying the word's
    *first* sub-token (not the full word) and that sub-token's predicted label.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(token, label)`` tuples in text order, where ``label`` is
        the string looked up in ``model.config.id2label``.
    """
    tokenized_inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    # word_ids() maps each token position to the index of the word it belongs
    # to; positions without a word (special tokens) map to None.
    word_ids = tokenized_inputs.word_ids(batch_index=0)

    # Inference only: disable autograd so no gradient state is tracked per call.
    with torch.no_grad():
        outputs = model(**tokenized_inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])
    labels = [model.config.id2label[class_id] for class_id in predictions[0].tolist()]

    # Keep a pair only where a new word starts; skip special tokens and the
    # continuation pieces of a word already seen.
    token_label_pairs = []
    previous_word_idx = None
    for token, word_idx, label in zip(tokens, word_ids, labels):
        if word_idx is not None and word_idx != previous_word_idx:
            token_label_pairs.append((token, label))
        previous_word_idx = word_idx

    return token_label_pairs

def calculate_specificity(text):
    """Score how densely ``text`` is tagged with entities.

    The score is the number of words whose predicted label is not ``"O"``
    divided by the number of whitespace-separated words in ``text``.

    Args:
        text: The comment to score.

    Returns:
        A tuple ``(score, entity_list, total_words)``:
        ``score`` is the ratio described above, ``entity_list`` holds the
        non-``"O"`` labels in text order (duplicates and B-/I- prefixes are
        kept), and ``total_words`` is ``len(text.split())``.
        Text with no words (empty or whitespace-only) yields ``(0.0, [], 0)``.
    """
    total_words = len(text.split())
    if total_words == 0:
        # Nothing to score; returning early also avoids dividing by zero.
        return 0.0, [], 0

    # NOTE(review): tokenize_and_predict truncates long inputs, while
    # total_words counts the whole text, so very long comments may be
    # under-scored. Confirm whether that matters for this dataset.
    entity_list = [label for _, label in tokenize_and_predict(text) if label != "O"]
    return len(entity_list) / total_words, entity_list, total_words

# Score every comment in row order. Non-string cells get a zero placeholder so
# the three result lists stay the same length as df.
for comment in df[patient_comments_col]:
    if isinstance(comment, str):
        specificity_score, unique_entities, total_words = calculate_specificity(comment)
    else:
        specificity_score, unique_entities, total_words = 0, [], 0

    specificity_scores.append(specificity_score)
    unique_entities_list.append(unique_entities)
    total_words_list.append(total_words)

# assign() builds a new frame instead of writing into `df`, which was produced
# by slicing the loaded frame; this avoids SettingWithCopyWarning.
df = df.assign(**{
    specificity_results: specificity_scores,
    unique_entities_results: unique_entities_list,
    total_words_results: total_words_list,
})

columns_to_keep = ['ID', 'unidentifiableid', 'Combined_Sentiment', 'Combined_Wait',
       'Updated_Wait',
       'BERT_Sentiment', 'BERT_Wait', 'BERT_MistakeMedical', 'careprovidercomments',
       'Medical_Mistakes', 'Clerical_Mistakes', 'Communication_Mistakes', 'Unique Entities', 'Total Words', 'Specificity Score']
df = df[columns_to_keep]

# NOTE(review): this overwrites the input workbook. Only the rows kept by the
# earlier iloc[:1000] slice and the columns listed above survive; everything
# else in the original file is lost. Consider writing to a separate output path.
df.to_excel(file_path, index=False)

In [16]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the checkpoint in this cell so it can run without the cells above.
tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")
model = AutoModelForTokenClassification.from_pretrained("Clinical-AI-Apollo/Medical-NER")

# The values of id2label are the tag names the model can predict.
entity_labels = model.config.id2label.values()

print("Entities recognized by the model:")
for label in entity_labels:
    # One label per line, followed by the same blank-line padding as before.
    print(label, "\n\n", sep="\n")

Entities recognized by the model:
O



B-ACTIVITY



I-ACTIVITY



I-ADMINISTRATION



B-ADMINISTRATION



B-AGE



I-AGE



I-AREA



B-AREA



B-BIOLOGICAL_ATTRIBUTE



I-BIOLOGICAL_ATTRIBUTE



I-BIOLOGICAL_STRUCTURE



B-BIOLOGICAL_STRUCTURE



B-CLINICAL_EVENT



I-CLINICAL_EVENT



B-COLOR



I-COLOR



I-COREFERENCE



B-COREFERENCE



B-DATE



I-DATE



I-DETAILED_DESCRIPTION



B-DETAILED_DESCRIPTION



I-DIAGNOSTIC_PROCEDURE



B-DIAGNOSTIC_PROCEDURE



I-DISEASE_DISORDER



B-DISEASE_DISORDER



B-DISTANCE



I-DISTANCE



B-DOSAGE



I-DOSAGE



I-DURATION



B-DURATION



I-FAMILY_HISTORY



B-FAMILY_HISTORY



B-FREQUENCY



I-FREQUENCY



I-HEIGHT



B-HEIGHT



B-HISTORY



I-HISTORY



I-LAB_VALUE



B-LAB_VALUE



I-MASS



B-MASS



I-MEDICATION



B-MEDICATION



I-NONBIOLOGICAL_LOCATION



B-NONBIOLOGICAL_LOCATION



I-OCCUPATION



B-OCCUPATION



B-OTHER_ENTITY



I-OTHER_ENTITY



B-OTHER_EVENT



I-OTHER_EVENT



I-OUTCOME



B-OUTCOME



I-PERSONAL_BACKGROUND