In [1]:
from transformers import pipeline

# Load the zero-shot classification pipeline with a model similar to BERT (RoBERTa-large-mnli)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define severity levels as possible labels
labels = ["Very Low", "Low", "Moderate", "High", "Very High"]

# Example medical report (replace this with your actual text data)
example_report = """
THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AGAINST THE DEVICE RESULTING IN AN UNSPECIFIED ADVERSE OUTCOME.
THE PROCEDURE PERFORMED WAS A TOTAL VAGINAL HYSTERECTOMY WITH MESH IMPLANT, AND CYSTOSCOPY.
"""

# Perform zero-shot classification on the medical report
result = classifier(example_report, candidate_labels=labels)

# Display the most probable severity level
print(f"Predicted severity level: {result['labels'][0]}")


2024-09-23 09:30:17.828141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Predicted severity level: High


In [13]:
example_report = """
"(B)(4). IF INFORMATION IS PROVIDED IN THE FUTURE, A SUPPLEMENTAL REPORT WILL BE ISSUED."
"""

In [15]:
labels = ["Lowest severity", "Mild severity", "Moderate severity", "High severity", "Most severe"]

# Perform zero-shot classification on the medical report
result = classifier(example_report, candidate_labels=labels)

# Display the most probable severity level
print(f"Predicted severity level: {result['labels'][0]}")

Predicted severity level: Mild severity


In [15]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import pandas as pd

In [19]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', timeout=60)  # Increase timeout
model = BertModel.from_pretrained('bert-base-uncased')




In [None]:
# Function to get BERT embeddings for a given text
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Mean of token embeddings

# Load your dataset (replace with actual file path)
data = pd.read_csv("./data/stress_urinary_incontinence.csv")

# Get BERT embeddings for each text entry in the dataset
embeddings = np.array([get_bert_embedding(text) for text in data['FOI_TEXT']])

# Apply K-Means clustering (5 clusters for 5 severity levels)
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data['cluster'] = kmeans.fit_predict(embeddings)




In [16]:
num_clusters = 5
for cluster in range(num_clusters):
    print(f"Cluster {cluster}:")
    print(data[data['cluster'] == cluster]['FOI_TEXT'].sample(5, random_state=42).tolist())
    print("\n")

Cluster 0:
['IT WAS REPORTED TO BOSTON SCIENTIFIC CORPORATION THAT A ADVANTAGE WAS IMPLANTED INTO THE PATIENT ON (B)(6) 2007. THE PATIENT EXPERIENCED COMPLICATIONS, FURTHER SURGERY, AND NONSURGICAL TREATMENT. PATIENT SYMPTOMS INCLUDE: BACK PAIN; THI PAIN; OFF VAG DIS; DIFF BOWEL; REC INCONT; AGGRAV INCONT; PSYCH; OTH PAIN: LEG WEAKNESS, STOMACH AREA. NONSURGICAL TREATMENTS: THE PATIENT WAS TREATED WITH PAIN MEDICATION. THE PATIENT WAS TREATED WITH OTHER MEDICATION (PLEASE SPECIFY): ANTIBIOTICS (CONSTANT USE) FOR THE TREATMENT OF: BLADDER INFECTIONS. ON (B)(6) 2007 THE PATIENT COMMENCED PHYSIOTHERAPY TREATMENT (INCLUDING PELVIC FLOOR EXERCISES OR TRAINING). TREATMENT DURATION: SLOW DOWN AFTER 2015 . THE PATIENT WAS TREATED WITH TOPICAL TREATMENT (INCLUDING OESTROGEN CREAM).', 'NOTE: TWO BOSTON SCIENTIFIC MESH DEVICES WERE IMPLANTED INTO THE SAME PATIENT. THIS REPORT PERTAINS TO THE ADVANTAGE FIT. IT WAS REPORTED TO BOSTON SCIENTIFIC CORPORATION THAT AN ADVANTAGE FIT SLING AND A PINNACLE

In [None]:
# Map clusters to severity levels manually based on the cluster characteristics
cluster_to_severity = {0: 'Lowest severity', 1: 'Mild severity', 2: 'Moderate severity', 3: 'High severity', 4: 'Most severe'}
data['severity_label'] = data['cluster'].map(cluster_to_severity)

# Save the labeled dataset to a CSV file
data.to_csv('clustered_maude_data.csv', index=False)

# Display the severity levels assigned to a sample of the dataset
print(data[['cleaned_text', 'severity_label']].head())

In [None]:
data['cluster'] = kmeans.fit_predict(embeddings)

In [24]:
embeddings = np.vstack(embeddings)

In [25]:
data['cluster'] = kmeans.fit_predict(embeddings)



In [27]:
cluster_to_severity = {4: 'Lowest severity', 3: 'Mild severity', 2: 'Moderate severity', 1: 'High severity', 0: 'Most severe'}
data['severity_label'] = data['cluster'].map(cluster_to_severity)

print(data[['FOI_TEXT', 'severity_label']].head())
# Save the labeled dataset to a CSV file
data.to_csv('./data/clustered_maude_data.csv', index=False)

# Display the severity levels assigned to a sample of the dataset

                                            FOI_TEXT     severity_label
0  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  Moderate severity
1  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  Moderate severity
2  IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...        Most severe
3  MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...  Moderate severity
4  THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AG...    Lowest severity


train

In [2]:
from sklearn.model_selection import train_test_split

In [3]:

# Load your dataset (replace with actual file path)
data = pd.read_csv('./data/clustered_maude_data.csv')

# Map severity labels to numeric values
severity_to_label = {
    'Lowest severity': 4,
    'Mild severity': 3,
    'Moderate severity': 2,
    'High severity': 1,
    'Most severe': 0
}
data['label'] = data['severity_label'].map(severity_to_label)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['FOI_TEXT'].tolist(), 
    data['label'].tolist(), 
    test_size=0.2, 
    random_state=42
)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert to PyTorch Dataset format
class MAUDEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MAUDEDataset(train_encodings, train_labels)
val_dataset = MAUDEDataset(val_encodings, val_labels)


In [8]:
from transformers import DistilBertForSequenceClassification

# Load pre-trained DistilBERT model for classification with 5 labels
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',             # Output directory
    evaluation_strategy="epoch",        # Evaluate after each epoch
    per_device_train_batch_size=32,     # Larger batch size for faster training
    per_device_eval_batch_size=64,      # Larger batch size for evaluation
    num_train_epochs=2,                 # Reduce number of epochs
    weight_decay=0.01,                  # Regularization
    logging_dir='./logs',               # Log directory
    logging_steps=10,                   # Log every 10 steps
    save_total_limit=2,                 # Keep only last 2 checkpoints
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The DistilBERT model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset             # Validation dataset
)


In [None]:
# Train the model
trainer.train()

# Save the model after training
model.save_pretrained('./fine_tuned_distilbert_severity_model')



In [12]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert_severity_model')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Set the model to evaluation mode and move it to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['High severity', 'High severity', 'High severity', 'High severity', 'High severity']


In [14]:
# New text data for prediction
new_texts = [
    "(B)(4). IF INFORMATION IS PROVIDED IN THE FUTURE, A SUPPLEMENTAL REPORT WILL BE ISSUED.",
    "There was an issue with the product, but it was resolved without any adverse outcomes.",
    "A malfunction occurred during the procedure, leading to a minor injury.",
    "The device has been reported to cause serious side effects, including organ damage.",
    "BASED ON ADDITIONAL INFORMATION RECEIVED THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT. IF INFORMATION IS PROVIDED IN THE FUTURE, A SUPPLEMENTAL REPORT WILL BE ISSUED."
]

# Tokenize the new texts
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")

# Move input tensors to the same device as the model
new_encodings = {key: val.to(device) for key, val in new_encodings.items()}

# Make predictions
with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Convert predictions back to severity labels
label_to_severity = {0: 'Lowest severity', 1: 'Mild severity', 2: 'Moderate severity', 3: 'High severity', 4: 'Most severe'}
predicted_severity = [label_to_severity[pred.item()] for pred in predictions]

print(predicted_severity)

['Most severe', 'High severity', 'High severity', 'High severity', 'Moderate severity']


In [None]:
import torch

# Ensure the model is in evaluation mode
model.eval()

# Make predictions (no gradient computation needed during evaluation)
with torch.no_grad():
    outputs = model(**encodings)

# Get the predicted class labels
predictions = torch.argmax(outputs.logits, dim=-1)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Map model output (integers) back to label names
label_mapping = {0: 'Lowest severity', 1: 'Mild severity', 2: 'Moderate severity', 3: 'High severity', 4: 'Most severe'}

predicted_labels = [label_mapping[p.item()] for p in predictions]


In [None]:
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=list(label_mapping.values()))

# Display confusion matrix
ConfusionMatrixDisplay(conf_matrix, display_labels=list(label_mapping.values())).plot()
