In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
#!pip install collections

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━

In [2]:
import torch
import bitsandbytes
import peft
import accelerate
import transformers
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BertForSequenceClassification, DataCollatorWithPadding, EvalPrediction
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, TrainerCallback
from collections import Counter, defaultdict

Download Model - Bits & Bytes

In [15]:

model_id = "bert-large-uncased"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = BertForSequenceClassification.from_pretrained(model_id,
                                                        num_labels=5,
                                                        quantization_config=bnb_config,
                                                        device_map={"":0}
                                                        )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                        #model=model
                                        )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prepare Model Training

In [16]:
# Prepare the model for training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [17]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 0 || all params: 183627781 || trainable%: 0.0


Lora - PEFT

In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, # Attention head
    lora_alpha=32, # alpha scaling
    #target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    #task_type="SEQ_2_SEQ"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 184414213 || trainable%: 0.426448692433484


PREPARE THE DATA

In [8]:
# Assuming label_to_index is defined globally
label_to_index = {}

# tokenize the dataset
def encode_batch(examples, label_to_index):

    prefix = 'Clasiffy: '

    # The "inputs" are the tokenized answer:
    inputs = [prefix + doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)

    # Convert labels to numerical if they are not already
    labels = [label_to_index.setdefault(label, len(label_to_index)) for label in examples['label']]
    model_inputs["labels"] = torch.tensor(labels)
    return model_inputs



In [9]:
# load the dataset
def load_split(split_name):
    dataset = load_dataset("ManuelAlv/Medical_Summaries")[split_name]

    #Map
    dataset = dataset.map(
        lambda examples: encode_batch(examples, label_to_index),
        batched=True,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on " + split_name + " dataset",
    )
    dataset.set_format(type="torch", columns=["input_ids", "labels"])
    return dataset

ACCURACY

In [10]:
import numpy as np
!pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m962.2 kB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Training Enviroment - With Processed Data

In [None]:
# small batch size to fit in memory
batch_size = 4

training_args = TrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    logging_steps=200,
    output_dir="./training_output",
    fp16=True,
    overwrite_output_dir=True,
    remove_unused_columns=False,
    lr_scheduler_type="linear"
)

# create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # load the dataset
    train_dataset=load_split("train"),
    #eval_dataset=load_split("validation"),
    compute_metrics=compute_metrics
)

# Train
trainer.train()

Running tokenizer on train dataset:   0%|          | 0/10828 [00:00<?, ? examples/s]



Step,Training Loss
200,1.5555
400,1.5339


In [None]:
trainer.evaluate(load_split("test"))

{'eval_runtime': 40.9679,
 'eval_samples_per_second': 36.126,
 'eval_steps_per_second': 9.031,
 'epoch': 4.99}

EVALUATION

In [None]:
model = model_new_full

In [14]:
from torch.utils.data import DataLoader
import evaluate

# Take a small subset for manual testing
subset = load_split("test")
dataloader = DataLoader(subset, batch_size=4)

metric = evaluate.load("accuracy")
model.eval()
for batch in dataloader:
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'accuracy': 0.6500553709856035}

EVALUTAION - Option 2

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Take a small subset for manual testing
#subset = load_split("test")
dataloader = DataLoader(subset, batch_size=4)

model.eval()  # Set model to evaluation mode
all_preds, all_labels = [], []

for batch in dataloader:
    with torch.no_grad():
        outputs = model(**{k: v.to(model.device) for k, v in batch.items() if k != 'labels'}).logits
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        labels = batch['labels'].numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Compute accuracy
print("Manual accuracy:", accuracy_score(all_labels, all_preds))


Manual accuracy: 0.08273349329067804


Save Model - PEFT MODEL

In [None]:
from huggingface_hub import notebook_login

notebook_login("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")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("ManuelAlv/PubMed_Classify_ClinicalBert_adapters")

adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ManuelAlv/PubMed_Classify_ClinicalBert_adapters/commit/7ec286d7a084bfcc3ce5d4a4af790242de2eebb7', commit_message='Upload model', commit_description='', oid='7ec286d7a084bfcc3ce5d4a4af790242de2eebb7', pr_url=None, pr_revision=None, pr_num=None)

Save - FULL MODEL

In [None]:
# login to upload the model
# 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
# from huggingface_hub import login
# login()

from huggingface_hub import HfApi
import torch
api = HfApi()

torch.save(model.state_dict(), 'pytorch_model.bin')

api.upload_file(
    path_or_fileobj="pytorch_model.bin",
    path_in_repo="pytorch_model.bin",
    # replace with your own username in order to upload
    repo_id="ManuelAlv/PubMed_Classify_ClinicalBert",
    repo_type="model",
)

pytorch_model.bin:   0%|          | 0.00/137M [00:00<?, ?B/s]

'https://huggingface.co/ManuelAlv/PubMed_Classify_ClinicalBert/blob/main/pytorch_model.bin'

Load Full Model - FROM PEFT

In [None]:
# Load the model
peft_model = "ManuelAlv/PubMed_Classify_ClinicalBert_adapters"
model_id = "ManuelAlv/PubMed_Classify_ClinicalBert"

In [None]:
from peft import PeftConfig, PeftModel
config = PeftConfig.from_pretrained(peft_model)
print(config.base_model_name_or_path)

adapter_config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

emilyalsentzer/Bio_ClinicalBERT


In [None]:
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(peft_model)

model_original = BertForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    #"emilyalsentzer/Bio_ClinicalBERT",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    num_labels = 5,
    device_map={"":0}
    )

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path
                                          #"emilyalsentzer/Bio_ClinicalBERT"
                                          )

model_new_full = PeftModel.from_pretrained(model_original, peft_model)

print_trainable_parameters(model_new_full)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

trainable params: 0 || all params: 65846789 || trainable%: 0.0


MERGE WEIGHTS - 4-BITS MODEL

In [None]:
from huggingface_hub import hf_hub_download

filename = "pytorch_model.bin"

# Download the state dict from Hugging Face Hub
state_dict = torch.load(hf_hub_download(model_id, filename))

# Load the state dict into the model
model_new_full.load_state_dict(state_dict)


pytorch_model.bin:   0%|          | 0.00/137M [00:00<?, ?B/s]

<All keys matched successfully>

MERGE FULL MODEL

In [None]:
from huggingface_hub import hf_hub_download

filename = "pytorch_model.bin"

# Download the state dict from Hugging Face Hub
state_dict = torch.load(hf_hub_download(model_id, filename))

# Get the current state of the model
current_state_dict = model_new_full.state_dict()

# Filter the state_dict to only include keys that exist in the current model and have the same size
filtered_state_dict = {k: v for k, v in state_dict.items() if k in current_state_dict and current_state_dict[k].size() == v.size()}

model_new_full.load_state_dict(filtered_state_dict, strict=False)


_IncompatibleKeys(missing_keys=['base_model.model.classifier.weight', 'base_model.model.classifier.bias'], unexpected_keys=[])

In [None]:
# # Merge models
# model_new_full.to("cpu")
model_new_full.merge_and_unload()

# # Save model
# model_new_full.push_to_hub("ManuelAlv/IMDB_Classify_Bart" )
# tokenizer.push_to_hub("ManuelAlv/IMDB_Classify_Bart_tk")

# # Load the Models from here
# model_test = BertForSequenceClassification.from_pretrained("ManuelAlv/IMDB_Classify_Bart", num_labels = 10,
#    device_map={"":0})
# tokenizer = AutoTokenizer.from_pretrained("ManuelAlv/IMDB_Classify_Bart_tk")





BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear4bit(in_features=768, out_features=768, bias=True)
              (key): Linear4bit(in_features=768, out_features=768, bias=True)
              (value): Linear4bit(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear4bit(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

Test The Model

In [None]:
num_validation = 10

validation_dataset = load_split('validation', num_validation)

for i in range(num_validation):
    # load the input and label
    input_ids = validation_dataset[i]['input_ids'].unsqueeze(0).to(0)
    label_ids = validation_dataset[i]['labels'].unsqueeze(0).to(0)
    # use the model to generate the output
    output = model.generate(input_ids, max_length=1024)
    # convert the tokens to text
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    label_text = tokenizer.decode(label_ids[0], skip_special_tokens=True)

    print('Input:', input_text)
    print('Output:', output_text)
    print('Label:', label_text)
    print('---')