In [1]:
# Install necessary libraries if not already installed
!pip install transformers datasets torch scikit-learn

# Import required libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments, pipeline
import numpy as np

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

# Load the datasets
patient_notes = pd.read_csv('/content/patient_notes.csv')
features = pd.read_csv('/content/features.csv')
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

# Merge datasets to associate notes with their corresponding features and annotations
data = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
data = data.merge(features, on=['feature_num', 'case_num'], how='left')

# Preprocess the text data
def preprocess_text(text):
    return text.strip().lower()

data['pn_history'] = data['pn_history'].apply(preprocess_text)
data['annotation'] = data['annotation'].apply(preprocess_text)

# Use a smaller subset for quick testing
data = data.sample(frac=0.1, random_state=42)  # Adjust the fraction as needed

# Tokenization and Encoding
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_and_align_labels(texts, annotations):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    labels = []

    for i, text in enumerate(texts):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(word_ids)  # -100 will be ignored by the loss function

        # Assume annotations are token-level indices; adjust as needed
        annotation_spans = [int(x) for x in annotations[i].split(';') if x.isdigit()]

        for start_idx in annotation_spans:
            for idx, word_id in enumerate(word_ids):
                if word_id is not None and start_idx <= word_id < start_idx + len(annotation_spans):
                    label_ids[idx] = 1  # 1 for positive annotation

        labels.append(label_ids)

    tokenized_inputs["labels"] = torch.tensor(labels)
    return tokenized_inputs

# Split the data into training and validation sets
train_texts, val_texts, train_annotations, val_annotations = train_test_split(
    data['pn_history'].tolist(),
    data['annotation'].tolist(),
    test_size=0.1,  # Use a smaller validation set
    random_state=42
)

# Tokenize and align labels for both training and validation sets
train_encodings = tokenize_and_align_labels(train_texts, train_annotations)
val_encodings = tokenize_and_align_labels(val_texts, val_annotations)

# Create a Dataset class for the encodings
class ClinicalNERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create dataset objects
train_dataset = ClinicalNERDataset(train_encodings)
val_dataset = ClinicalNERDataset(val_encodings)

# Load a smaller, faster model (DistilBERT)
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Increase epochs for better learning
    per_device_train_batch_size=16,  # Adjust based on your memory
    per_device_eval_batch_size=16,
    warmup_steps=100,  # Fewer warmup steps to speed up training
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    save_steps=500,
    eval_steps=500,
    fp16=True,  # Enable mixed precision training if supported by your GPU
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Make predictions on the validation set
predictions, true_labels, _ = trainer.predict(val_dataset)

# Convert predictions to label IDs
pred_ids = np.argmax(predictions, axis=2)

# Remove ignored index (-100) from labels
true_labels = np.array(true_labels)
true_labels[true_labels == -100] = 0  # Convert ignored index to neutral for easy handling

# Define a mapping (e.g., B-SYMPTOM, I-CONDITION)
label_map = {0: 'O', 1: 'B-SYMPTOM', 2:'I-CONDITION'}

# Decode predictions only
decoded_preds = [[label_map[p] for p in pred] for pred in pred_ids]

# Display a few sample predictions
print("Sample Predictions:")
for i in range(5):  # Display the first 5 predictions
    print(f"\nPatient Note {i+1}:")
    print(val_texts[i])
    print(f"Predicted Labels: {decoded_preds[i]}")
    print("-" * 50)

# Summarization Pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load and summarize the 'pn_history' column instead of 'text'
texts = patient_notes['pn_history'].tolist()

# Generate summaries for all texts
summaries = []

for i, text in enumerate(texts[:5]):  # Limit to first 5 for quick testing
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])
    print(f"Summary for text {i+1}:")
    print(summary[0]['summary_text'])
    print("-" * 50)



Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Evaluation Results: {'eval_loss': nan, 'eval_runtime': 62.1891, 'eval_samples_per_second': 2.299, 'eval_steps_per_second': 0.145, 'epoch': 3.0}
Sample Predictions:

Patient Note 1:
patient is a 35 y/o m with no pmh presents with epigastric pain for 2 months. the pain is described as burning pain, no radiation, 5/10, partially relieved with toms but not helping recently, associated with nausea, bloasting and stomach fullness. denies food regurgitation, vomiting, diarrhea, constipation, weakness , numbness, fever, headache, chest pain, sob. pt has been using motrin excessicvely for back aches and muscle pain. he works in construction. he also takes 5-6 cups of coffee every day.
ros: -ve except hpi
allergy : none
meds: motrin, toms
pmh: none
psh: none
fh: uncle had pud
sh: drinks alcohol, few beers a week but has decreased intake, smoking 1/2-1 packs per day since age 15, no illicit drug use, exercises regularly
Predicted Labels: ['O', 'B-SYMPTOM', 'B-SYMPTOM', 'B-SYMPTOM', 'B-SYMPTOM', '

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 130, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Summary for text 1:
17-year-old male, has come to the student health clinic complaining of heart pounding. Cleveland's mother has given verbal consent for a history, physical examination, and treatment.
--------------------------------------------------
Summary for text 2:
17 yo male with recurrent palpitations for the past 3 mo lasting about 3 - 4 min. Has tried aterol to be able to better concentrate, has received it from his roommate.
--------------------------------------------------
Summary for text 3:
Dillon Cleveland, 17, presents with complaints of heart pounding. He cannot think of any triggers, and it has occurred both with activity and at rest. Occasionally, it is accompanied by chest pressure but not pain that is located at the center of his chest.
--------------------------------------------------
Summary for text 4:
A 17 yo m c/o palpitation started 3 mos ago. NOT ASSOCIATED WITH NAUSEA VOMITING; HEADACHE; ABDOMINAL PAIN; CHANGES IN URINATION OR BOWEL HABITS, OR TREMOR OR