In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, DataCollatorWithPadding, Trainer, TrainingArguments, BertForSequenceClassification, pipeline
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model
import torch
import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("gretelai/symptom_to_diagnosis")

In [3]:
dataset = dataset.rename_column("output_text", "label")

In [4]:
sorted_labels = sorted(set(dataset['train']['label']))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

In [5]:
dataset['train'][0]

{'label': 'cervical spondylosis',
 'input_text': "I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak."}

### Load model

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [8]:

foundation_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
test_input = tokenizer([
    "first symptom",
    "second symptom"
], padding=True, truncation=True, return_tensors="pt").to(device)


In [10]:
test_logits = foundation_model(**test_input).logits

In [11]:
torch.softmax(test_logits, dim=1).argmax(dim=1).cpu()

tensor([2, 2])

### Make a pipeline from foundation model

In [12]:
classifier = pipeline("text-classification", model=foundation_model, tokenizer=tokenizer, device=device)

In [13]:
classifier(['first symptom', 'second symptom'])

[{'label': 'bronchial asthma', 'score': 0.0729617029428482},
 {'label': 'bronchial asthma', 'score': 0.07156827300786972}]

### Prepare Lora finetuning

In [14]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(foundation_model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_A): P

In [15]:
peft_model.print_trainable_parameters()

trainable params: 2,376,214 || all params: 111,875,372 || trainable%: 2.1240


In [16]:
for n, p in peft_model.bert.named_parameters():
    if 'lora' in n:
        print(f"{n}: {p.shape}")

encoder.layer.0.attention.self.query.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.0.attention.self.query.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.0.attention.self.value.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.0.attention.self.value.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.1.attention.self.query.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.1.attention.self.query.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.1.attention.self.value.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.1.attention.self.value.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.2.attention.self.query.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.2.attention.self.query.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.2.attention.self.value.lora_A.default.weight: torch.Size([64, 768])
encoder.layer.2.attention.self.value.lora_B.default.weight: torch.Size([768, 64])
encoder.layer.3.

In [17]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["input_text"], padding="max_length", truncation=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

{'train': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 853
}), 'test': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 212
})}


In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()*100}

In [19]:
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-lora",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()



Starting to train...


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/peft/peft_model.py", line 1446, in forward
    return self.base_model(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 197, in forward
    return self.model.forward(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1695, in forward
    outputs = self.bert(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1141, in forward
    encoder_outputs = self.encoder(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 694, in forward
    layer_outputs = layer_module(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 584, in forward
    self_attention_outputs = self.attention(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 514, in forward
    self_outputs = self.self(
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pim/miniconda3/envs/ner/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 327, in forward
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 21.96 GiB total capacity; 1.79 GiB already allocated; 20.00 MiB free; 1.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [67]:
trainer.evaluate()



{'eval_loss': 2.238173007965088,
 'eval_accuracy': 25.943396226415093,
 'eval_runtime': 2.4841,
 'eval_samples_per_second': 85.344,
 'eval_steps_per_second': 10.869,
 'epoch': 2.0}

In [12]:
peft_bert_model_path = "fine-tuned-peft-model-weights/"
peft_model.save_pretrained(peft_bert_model_path)

NameError: name 'peft_model' is not defined

In [13]:
config = PeftConfig.from_pretrained('fine-tuned-peft-model-weights/')

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                      num_labels=22)

model = PeftModel.from_pretrained(model, 'fine-tuned-peft-model-weights/')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
test_lora_example = tokenizer([
    "first symptom",
    "second symptom"
], padding=True, truncation=True, return_tensors="pt").to(device)

In [None]:
model.to(device)

In [18]:
model.device

device(type='cuda', index=0)

In [19]:
model(**test_lora_example).logits.argmax(dim=1).cpu()

tensor([16, 16])

### Bert from scratch