In [1]:
# Pip installs
!pip install transformers torch
!pip install transformers[torch]
!pip install accelerate -U



In [11]:
# Define the label to id and id to label mappings
label2id = {"O": 0, "B-ALLERGY": 1, "I-ALLERGY": 2}
id2label = {0: "O", 1: "B-ALLERGY", 2: "I-ALLERGY"}

In [12]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import os

# Setting path
base_path = "D:/Abdullah/Documents/MediMatch"
model_relative_path = os.path.join("Abdul/NLP_Models/Allergies")
model_path = os.path.join(base_path, model_relative_path)

# Load model and tokenizers
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [15]:
# Example text here
text = "Im allergic to cats and dogs and horses"

# Tokenize
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()

# Maps prediciton to entity labels
predicted_labels = [id2label[label] for label in predictions]

# Combine tokens and labels
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
labeled_tokens = list(zip(tokens, predicted_labels))

def aggregate_entities(tokens, predicted_labels):
    entities = []
    current_entity = []
    current_label = None

    for token, label in zip(tokens, predicted_labels):
        # Skip over the special tokens
        if token in ["[CLS]", "[SEP]"]:
            continue

        # Subword tokens
        if token.startswith("##"):
            if current_entity:
                current_entity[-1] += token[2:]  # Merge with the previous token
            continue

        # BIO tags
        if label.startswith("B-"):
            if current_entity:  # Save the previous
                entities.append((current_label, " ".join(current_entity)))
            current_entity = [token]
            current_label = label[2:]  # Remove prefix
        elif label.startswith("I-") and current_label == label[2:]:
            current_entity.append(token)
        else:  
            if current_entity:  # Save the previous
                entities.append((current_label, " ".join(current_entity)))
                current_entity = []
                current_label = None

    if current_entity:
        entities.append((current_label, " ".join(current_entity)))

    return entities

entities = aggregate_entities(tokens, predicted_labels)

def filter_entities(entities, filter_words):
    filtered_entities = []
    for label, entity in entities:
        # Split entity into words and filter out the unwanted ones
        filtered_entity_words = [word for word in entity.split() if word.lower() not in filter_words]
        if filtered_entity_words: 
            filtered_entities.append((label, " ".join(filtered_entity_words)))
    return filtered_entities

# Words to be filtered out of output
filter_words = {"of", "from", "to", "and"}
filtered_entities = filter_entities(entities, filter_words)

# Print outptu
for label, entity in filtered_entities:
    print(f"{label}: {entity}")

ALLERGY: cats
ALLERGY: dogs
ALLERGY: horses


In [66]:

def process_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(-1).squeeze().tolist()
    predicted_labels = [id2label[label] for label in predictions]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    labeled_tokens = list(zip(tokens, predicted_labels))

    def aggregate_entities(tokens, predicted_labels):
        entities = []
        current_entity = []
        current_label = None
        for token, label in zip(tokens, predicted_labels):
            if token in ["[CLS]", "[SEP]"]:
                continue
            if token.startswith("##"):
                if current_entity:
                    current_entity[-1] += token[2:]
                continue
            if label.startswith("B-"):
                if current_entity:
                    entities.append((current_label, " ".join(current_entity)))
                current_entity = [token]
                current_label = label[2:]
            elif label.startswith("I-") and current_label == label[2:]:
                current_entity.append(token)
            else:
                if current_entity:
                    entities.append((current_label, " ".join(current_entity)))
                    current_entity = []
                    current_label = None
        if current_entity:
            entities.append((current_label, " ".join(current_entity)))
        return entities

    entities = aggregate_entities(tokens, predicted_labels)

    def filter_entities(entities, filter_words):
        filtered_entities = []
        for label, entity in entities:
            filtered_entity_words = [word for word in entity.split() if word.lower() not in filter_words]
            if filtered_entity_words:
                filtered_entities.append((label, " ".join(filtered_entity_words)))
        return filtered_entities

    filter_words = {"of", "from", "to", "and"}
    filtered_entities = filter_entities(entities, filter_words)
    return filtered_entities

# Test data geneerated from CHATGPT
test_data = [
    "I'm allergic to peanuts."
    "I'm allergic to cats and dust mites."
    "I'm allergic to cats, dogs, and cheese."
    "I'm allergic to shellfish."
    "I'm allergic to pollen and grass."
    "I'm allergic to gluten'."
    "I'm allergic to milk."
    "I'm allergic to latex and bananas."
    "I have a seafood allergy."
    "I'm allergic to dogs."
    "I'm allergic to ragweed pollen."
    "I have lactose intolerance."
    "I have allergies to dogs"
    "I'm allergic to bee stings."
    "I'm allergic to shellfish and peanuts."
    "I have asthma triggered by dust mites."
    "I'm allergic to mold and pollen."
    "I'm allergic to dairy products."
    "I have a sesame allergy."
    "I'm allergic to eggs and soy."
    "I'm allergic to shellfish and fish."
    "I'm allergic to peanuts and milk."
    "I'm allergic to pollen, grass, and mold."
]

# Process each phrase in test_data
for text in test_data:
    filtered_entities = process_text(text)
    for label, entity in filtered_entities:
        print(f"{label}: {entity}")
    print("------")


ALLERGY: peanuts
ALLERGY: cats
ALLERGY: dust mites
ALLERGY: cats , dogs
ALLERGY: shellfish
ALLERGY: pollen
ALLERGY: grass
ALLERGY: gluten
ALLERGY: milk
ALLERGY: latex
ALLERGY: bananas
ALLERGY: dogs
ALLERGY: ragweed pollen
ALLERGY: bee stings
ALLERGY: shellfish
ALLERGY: peanuts
ALLERGY: by dust mites
ALLERGY: mold
ALLERGY: pollen
ALLERGY: dairy products
ALLERGY: eggs
ALLERGY: shellfish
ALLERGY: fish
ALLERGY: peanuts
ALLERGY: milk
ALLERGY: pollen
------


In [68]:
def process_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(-1).squeeze().tolist()
    predicted_labels = [id2label[label] for label in predictions]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    labeled_tokens = list(zip(tokens, predicted_labels))

    def aggregate_entities(tokens, predicted_labels):
        entities = []
        current_entity = []
        current_label = None
        for token, label in zip(tokens, predicted_labels):
            if token in ["[CLS]", "[SEP]"]:
                continue
            if token.startswith("##"):
                if current_entity:
                    current_entity[-1] += token[2:]
                continue
            if label.startswith("B-"):
                if current_entity:
                    entities.append((current_label, " ".join(current_entity)))
                current_entity = [token]
                current_label = label[2:]
            elif label.startswith("I-") and current_label == label[2:]:
                current_entity.append(token)
            else:
                if current_entity:
                    entities.append((current_label, " ".join(current_entity)))
                    current_entity = []
                    current_label = None
        if current_entity:
            entities.append((current_label, " ".join(current_entity)))
        return entities

    entities = aggregate_entities(tokens, predicted_labels)

    def filter_entities(entities, filter_words):
        filtered_entities = []
        for label, entity in entities:
            filtered_entity_words = [word for word in entity.split() if word.lower() not in filter_words]
            if filtered_entity_words:
                filtered_entities.append((label, " ".join(filtered_entity_words)))
        return filtered_entities

    filter_words = {"of", "from", "to", "and"}
    filtered_entities = filter_entities(entities, filter_words)
    return filtered_entities

# Test data geneerated from CHATGPT
test_data = [
"I find joy in cooking healthy meals with fresh ingredients."
"I prioritize regular physical activity to maintain my fitness levels."
"I make time for hobbies that bring me joy and reduce stress."
"I prioritize self-care activities like baths and massages for relaxation."
"I enjoy family picnics as a way to bond and enjoy nutritious meals together."
"I prioritize sleep hygiene practices for better sleep and overall health."
"I make time for laughter and humor to improve my mood and well-being."
"I prioritize healthy snacks like fruits and nuts for sustained energy."
"I find joy in spending time with pets for my mental and emotional health."
"I enjoy swimming as a refreshing and low-impact form of exercise."
"I prioritize relaxation techniques like meditation for stress relief."
"I make time for regular family walks to catch up and connect."
"I prioritize regular check-ups to monitor my health and well-being."
"I find joy in exploring nature through activities like birdwatching."
"I prioritize spending time in nature to recharge and rejuvenate."
"I enjoy dancing to my favorite music as a form of exercise and self-expression."
"I prioritize spending time with loved ones to maintain strong relationships."
"I make time for hobbies like painting or crafting to relax and unwind."
"I enjoy cooking healthy meals as a way to nourish my body and soul."
"I prioritize time for relaxation and self-care to reduce stress levels."
]

# Process each phrase in test_data
for text in test_data:
    filtered_entities = process_text(text)
    for label, entity in filtered_entities:
        print(f"{label}: {entity}")
    print("------")

------


In [69]:
# Given values
Total = 53
True_Positives = 18
False_Positives = 5
total_empty = 20
True_Negatives = 20
False_Negatives = 0

# Calculate Accuracy
accuracy = (True_Positives + True_Negatives) / Total

# Calculate Precision
precision = True_Positives / (True_Positives + False_Positives)

# Calculate Recall (Sensitivity)
recall = True_Positives / (True_Positives + False_Negatives)

# Calculate Specificity
specificity = True_Negatives / (True_Negatives + False_Positives)

# Calculate F1 Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Specificity: {specificity:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.72
Precision: 0.78
Recall: 1.00
Specificity: 0.80
F1 Score: 0.88
