In [1]:
from transformers import pipeline

# Load the zero-shot classification pipeline with the facebook/bart-large-mnli
# checkpoint (a BART model, not BERT/RoBERTa as an earlier comment suggested).
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels the classifier chooses between, one per severity level
labels = ["Very Low", "Low", "Moderate", "High", "Very High"]

# Example medical report (replace this with your actual text data)
example_report = """
THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AGAINST THE DEVICE RESULTING IN AN UNSPECIFIED ADVERSE OUTCOME.
THE PROCEDURE PERFORMED WAS A TOTAL VAGINAL HYSTERECTOMY WITH MESH IMPLANT, AND CYSTOSCOPY.
"""

# Score the report against every candidate label (no task-specific training involved)
result = classifier(example_report, candidate_labels=labels)

# Display the most probable severity level (the first entry of result['labels'])
print(f"Predicted severity level: {result['labels'][0]}")


2024-09-23 09:30:17.828141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Predicted severity level: High


In [13]:
# Swap in a second sample: a narrative holding only a "(B)(4)" marker and a
# pending-information sentence, i.e. text with no clinical detail to classify.
example_report = """
"(B)(4). IF INFORMATION IS PROVIDED IN THE FUTURE, A SUPPLEMENTAL REPORT WILL BE ISSUED."
"""

In [15]:
# Candidate labels rephrased so each one names a severity level explicitly
labels = [
    "Lowest severity",
    "Mild severity",
    "Moderate severity",
    "High severity",
    "Most severe",
]

# Score the current example_report against the rephrased candidates
result = classifier(
    example_report,
    candidate_labels=labels,
)

# Report the top-ranked label (first entry of result['labels'])
print(f"Predicted severity level: {result['labels'][0]}")

Predicted severity level: Mild severity


In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import pandas as pd

In [19]:

# Load the pretrained BERT tokenizer and encoder used to embed the report text.
# The earlier `timeout=60` argument was dropped: `from_pretrained` has no such
# download option, so the value was only forwarded to the tokenizer constructor
# and never lengthened any network timeout. Hub timeouts are controlled by the
# HF_HUB_ETAG_TIMEOUT / HF_HUB_DOWNLOAD_TIMEOUT environment variables; set them
# in the environment before starting the kernel if downloads time out.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')




In [None]:
# Mean-pooled BERT embedding for a piece of text
def get_bert_embedding(text):
    """Embed `text` with the notebook-level `tokenizer` and `model`.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model with gradient tracking disabled, and the final hidden states are
    averaged over dimension 1 (the token axis).

    Returns:
        A NumPy array holding the averaged hidden states. For a single string
        this is presumably shaped (1, hidden_size) - TODO confirm.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()

# Load your dataset (replace with actual file path)
data = pd.read_csv("./data/stress_urinary_incontinence.csv")

# Get BERT embeddings for each text entry in the dataset.
# get_bert_embedding returns one array per text, so the per-text results are
# stacked row-wise into a single 2-D (n_texts, n_features) matrix. Wrapping
# them in np.array([...]) instead yields a 3-D array, which KMeans rejects.
embeddings = np.vstack([get_bert_embedding(text) for text in data['FOI_TEXT']])

# Apply K-Means clustering (5 clusters for 5 severity levels)
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data['cluster'] = kmeans.fit_predict(embeddings)

# Map clusters to severity levels.
# NOTE(review): K-Means cluster ids are arbitrary, so cluster 0 is not
# inherently the "lowest" severity. Inspect sample texts from each cluster and
# adjust this mapping before treating the labels as meaningful.
cluster_to_severity = {0: 'Lowest severity', 1: 'Mild severity', 2: 'Moderate severity', 3: 'High severity', 4: 'Most severe'}
data['severity_label'] = data['cluster'].map(cluster_to_severity)

# Save the labeled dataset under ./data/, the path the training step reads back
data.to_csv('./data/clustered_maude_data.csv', index=False)

# Display the severity levels assigned to a sample of the dataset
# (FOI_TEXT is the column the embeddings were computed from)
print(data[['FOI_TEXT', 'severity_label']].head())


In [None]:
# NOTE(review): same statement as the In [25] cell further down, but placed
# before `embeddings` is passed through np.vstack - confirm whether this cell
# is still needed.
data['cluster'] = kmeans.fit_predict(embeddings)

In [24]:
# Stack the entries of `embeddings` row-wise into one array before clustering.
# NOTE(review): presumably turns (n_texts, 1, hidden) into (n_texts, hidden) - confirm.
embeddings = np.vstack(embeddings)

In [25]:
# Fit K-Means on the stacked embeddings and store each row's cluster id
data['cluster'] = kmeans.fit_predict(embeddings)



In [27]:
# Translate each cluster id (0-4) into the severity name at the same position
cluster_to_severity = dict(enumerate([
    'Lowest severity',
    'Mild severity',
    'Moderate severity',
    'High severity',
    'Most severe',
]))
data['severity_label'] = data['cluster'].map(cluster_to_severity)

# Display the severity levels assigned to a sample of the dataset
print(data[['FOI_TEXT', 'severity_label']].head())

# Save the labeled dataset to a CSV file
data.to_csv('./data/clustered_maude_data.csv', index=False)

                                            FOI_TEXT     severity_label
0  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  Moderate severity
1  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  Moderate severity
2  IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...        Most severe
3  MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...  Moderate severity
4  THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AG...    Lowest severity


Train: fine-tune a DistilBERT classifier on the cluster-derived severity labels

In [2]:
from sklearn.model_selection import train_test_split

In [3]:

# Reload the cluster-labelled reports (replace with actual file path)
data = pd.read_csv('./data/clustered_maude_data.csv')

# Encode each severity name as its position in the ordered scale (0-4)
severity_to_label = dict(zip(
    (
        'Lowest severity',
        'Mild severity',
        'Moderate severity',
        'High severity',
        'Most severe',
    ),
    range(5),
))
data['label'] = data['severity_label'].map(severity_to_label)

# Hold out 20% of the reports for validation; the seed is fixed for repeatability
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['FOI_TEXT'].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Tokenizer used to turn report text into model inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize both splits with identical settings (truncate/pad, at most 512 tokens)
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=512)
    for split_texts in (train_texts, val_texts)
)

# PyTorch Dataset wrapper around tokenizer encodings and their labels
class MAUDEDataset(torch.utils.data.Dataset):
    """Pairs per-example encoding fields with an integer label.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection of the same
    length. Indexing returns a dict of tensors, one per encoding field plus
    a 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training and validation splits as MAUDEDataset instances
train_dataset = MAUDEDataset(train_encodings, train_labels)
val_dataset = MAUDEDataset(val_encodings, val_labels)


In [8]:
from transformers import DistilBertForSequenceClassification

# Load pre-trained DistilBERT model for classification with 5 labels.
# This rebinds `model`, which an earlier cell bound to the BertModel encoder.
# NOTE(review): the texts are tokenized with BertTokenizer ('bert-base-uncased')
# while this model is DistilBERT - confirm the two are meant to be paired.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                   # Emit a log record every 10 steps
    save_total_limit=2,                 # Keep at most 2 checkpoints on disk
    # Schedule
    num_train_epochs=2,                 # Two passes over the training set
    evaluation_strategy="epoch",        # Run evaluation at each epoch boundary
    # Batch sizes per device
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Regularization
    weight_decay=0.01,
)

# Bind the model, the configuration and both datasets into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [11]:
# Train the model using the Trainer configured earlier
trainer.train()

# Save the model after training.
# NOTE: this runs only if trainer.train() returns; if training is interrupted
# or raises, the cell stops there and nothing is written to this directory.
model.save_pretrained('./fine_tuned_distilbert_severity_model')



  0%|          | 0/482 [00:00<?, ?it/s]

{'loss': 1.2721, 'grad_norm': 2.5493381023406982, 'learning_rate': 4.896265560165975e-05, 'epoch': 0.04}


KeyboardInterrupt: 