In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [3]:
!pip freeze | grep bitsandbytes
!pip freeze | grep accelerate

bitsandbytes==0.42.0
accelerate @ git+https://github.com/huggingface/accelerate.git@649e65b542a5740fb5ce663bbd5af45ed426c06f


In [4]:
!python -m bitsandbytes

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

++++++++++++++++++ /usr/local CUDA PATHS +++++++++++++++++++
/usr/local/nvidia/lib64/libcuda.so
/usr/local/cuda-11.8/targets/x86_64-linux/lib/libcudart.so
/usr/local/cuda-11.8/compat/libcuda.so

+++++++++++++++ WORKING DIRECTORY CUDA PATHS +++++++++++++++


++++++++++++++++++ LD_LIBRARY CUDA PATHS +++++++++++++++++++
+++++++++++++ /usr/local/cuda/lib64 CUDA PATHS +++++++++++++

++++++++++++ /usr/local/nvidia/lib64 CUDA PATHS ++++++++++++
/usr/local/nvidia/lib64/libcuda.so
++++++++++++++++ /opt/conda/lib CUDA PATHS +++++++++++++++++
/opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda110.so
/opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
/opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so
/opt/conda/lib/python3.10/s

In [16]:
from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoTokenizer,
    AutoConfig,
    PreTrainedModel,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)
from datasets import load_from_disk, Dataset
import bitsandbytes as bnb
import torch
import accelerate

In [17]:
model_name = "nlpie/distil-biobert"

In [18]:
qlora = False

if qlora:
    qlora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
        task_type="CAUSAL_LM"
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=bnb_config,
    )
    
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, qlora_config)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
    )

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

  return self.fget.__get__(instance, owner)()
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [19]:
import pandas as pd
dataframe = pd.read_csv("/kaggle/input/altegrad-2023/text.csv")
dataset = Dataset.from_pandas(dataframe)

context_length = 256

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    return {"input_ids": outputs["input_ids"]}

tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset.column_names
)
split_dataset = tokenized_dataset.train_test_split(test_size=0.01)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
tokenized_dataset, split_dataset, train_dataset, eval_dataset

  0%|          | 0/34 [00:00<?, ?ba/s]

(Dataset({
     features: ['input_ids'],
     num_rows: 33691
 }),
 DatasetDict({
     train: Dataset({
         features: ['input_ids'],
         num_rows: 33354
     })
     test: Dataset({
         features: ['input_ids'],
         num_rows: 337
     })
 }),
 Dataset({
     features: ['input_ids'],
     num_rows: 33354
 }),
 Dataset({
     features: ['input_ids'],
     num_rows: 337
 }))

In [26]:
OUTPUT_DIR = "/kaggle/working/checkpoint"

if qlora:
    training_args = TrainingArguments(
        evaluation_strategy="steps",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,

        num_train_epochs=4,

        learning_rate=5e-4,
        lr_scheduler_type="cosine",

        save_steps=500,
        eval_steps=100,
        logging_steps=100,

        seed=42,
        fp16=True,
        optim="paged_adamw_8bit",

        warmup_steps=500,
        gradient_accumulation_steps=8,

        load_best_model_at_end=True,

        # resume_from_checkpoint=OUTPUT_DIR,
        output_dir=OUTPUT_DIR,
        save_total_limit=5,
        report_to="none"
    )
else:
    training_args = TrainingArguments(
        evaluation_strategy="steps",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,

        num_train_epochs=5,

        learning_rate=5e-4,
        lr_scheduler_type="cosine",

        save_steps=250,
        eval_steps=50,
        logging_steps=50,

        seed=42,
        fp16=False,

        weight_decay=0.1,
        warmup_steps=175,
        gradient_accumulation_steps=8,

        load_best_model_at_end=True,

        # resume_from_checkpoint=OUTPUT_DIR,
        output_dir=OUTPUT_DIR,
        save_total_limit=5,
        report_to="none"
    )

In [27]:
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = "[PAD]"
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)


In [28]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    data_collator=data_collator,
)
trainer.train()

Step,Training Loss,Validation Loss
50,0.5333,0.032736
100,0.0231,0.002331
150,0.0064,0.000988
200,0.004,0.000724
250,0.0038,0.000344
300,0.0028,0.001639
350,0.0045,0.000322
400,0.002,6.6e-05
450,0.0013,8.8e-05
500,0.0009,3.4e-05


KeyboardInterrupt: 

In [29]:
!zip -r checkpoint-250.zip /kaggle/working/checkpoint/checkpoint-250
!zip -r checkpoint-500.zip /kaggle/working/checkpoint/checkpoint-500

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/checkpoint/checkpoint-250/ (stored 0%)
  adding: kaggle/working/checkpoint/checkpoint-250/vocab.txt (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-250/optimizer.pt (deflated 7%)
  adding: kaggle/working/checkpoint/checkpoint-250/tokenizer.json (deflated 70%)
  adding: kaggle/working/checkpoint/checkpoint-250/rng_state.pth (deflated 28%)
  adding: kaggle/working/checkpoint/checkpoint-250/trainer_state.json (deflated 72%)
  adding: kaggle/working/checkpoint/checkpoint-250/config.json (deflated 50%)
  adding: kaggle/working/checkpoint/checkpoint-250/generation_config.json (deflated 8%)
  adding: kaggle/working/checkpoint/checkpoint-250/scheduler.pt (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-250/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/checkpoint/checkpoint-250/training_args.bin (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-250/special_tokens_map.json (deflated 76%)
  adding: kaggle/working/chec

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/checkpoint/checkpoint-500/ (stored 0%)
  adding: kaggle/working/checkpoint/checkpoint-500/vocab.txt (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-500/optimizer.pt (deflated 7%)
  adding: kaggle/working/checkpoint/checkpoint-500/tokenizer.json (deflated 70%)
  adding: kaggle/working/checkpoint/checkpoint-500/rng_state.pth (deflated 28%)
  adding: kaggle/working/checkpoint/checkpoint-500/trainer_state.json (deflated 77%)
  adding: kaggle/working/checkpoint/checkpoint-500/config.json (deflated 50%)
  adding: kaggle/working/checkpoint/checkpoint-500/generation_config.json (deflated 8%)
  adding: kaggle/working/checkpoint/checkpoint-500/scheduler.pt (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-500/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/checkpoint/checkpoint-500/training_args.bin (deflated 49%)
  adding: kaggle/working/checkpoint/checkpoint-500/special_tokens_map.json (deflated 76%)
  adding: kaggle/working/chec