In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install bitsandbytes
!pip install accelerate
!pip install tensorboard

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
import numpy as np
import transformers
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
ds = load_dataset("Shees7/Disease-recipes-conversational-data")

# Split into training and validation sets
split_ds = ds["train"].train_test_split(test_size=0.1, seed=42)
train_ds = split_ds["train"]
val_ds = split_ds["test"]
print(f"Training samples: {len(train_ds)}, Validation samples: {len(val_ds)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


conversational_disease_recipes.jsonl:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1199 [00:00<?, ? examples/s]

Training samples: 1079, Validation samples: 120


In [None]:
!huggingface-cli login --token YOUR_HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `local_llama3.1_8b` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `local_llama3.1_8b`


In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [6]:
# Prepare the model with 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load the model with quantization config
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
adapter_path = "/content/drive/My Drive/checkpoint-100"
model = PeftModel.from_pretrained(base_model, adapter_path)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [9]:
# Configure LoRA
config = LoraConfig(
    r=16,  # dimension of the low-rank matrices
    lora_alpha=32,  # scaling factor for the weight matrices
    lora_dropout=0.05,  # dropout probability for LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [10]:
# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


In [11]:
# Format data for training
def format_conversation(example):
    messages = example["messages"]
    formatted_text = ""

    for message in messages:
        role = message["role"]
        content = message["content"]

        if role == "user":
            formatted_text += f"<s>[INST] {content} [/INST] "
        elif role == "assistant":
            formatted_text += f"{content}</s>"

    return {"formatted_text": formatted_text}


In [12]:
def tokenize_function(examples):
    return tokenizer(
        examples["formatted_text"],
        padding="max_length",
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    )

In [13]:
# Process datasets - Apply formatting first
train_ds_formatted = train_ds.map(
    format_conversation,
    remove_columns=train_ds.column_names
)

val_ds_formatted = val_ds.map(
    format_conversation,
    remove_columns=val_ds.column_names
)

# Then tokenize
tokenized_train_ds = train_ds_formatted.map(
    tokenize_function,
    batched=True,
    remove_columns=["formatted_text"]
)

tokenized_val_ds = val_ds_formatted.map(
    tokenize_function,
    batched=True,
    remove_columns=["formatted_text"]
)

Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [14]:
# Set labels equal to input_ids
def set_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

tokenized_train_ds = tokenized_train_ds.map(set_labels, batched=True)
tokenized_val_ds = tokenized_val_ds.map(set_labels, batched=True)

Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [15]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/ai_recipe/",
    num_train_epochs=3,
    per_device_train_batch_size=2,        #2,
    per_device_eval_batch_size=2,         #2,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    eval_steps=50,                        #20,
    logging_dir="./logs",
    logging_steps=50,                     #10,
    save_strategy="steps",
    save_steps=50,                        #20,
    learning_rate=1e-4,                 #2e-4
    weight_decay=0.01,
    fp16=True,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    resume_from_checkpoint=True,
    greater_is_better=False,
)

In [16]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    # Filter out padding tokens
    mask = labels != -100
    labels = labels[mask]
    predictions = predictions[mask]

    # Calculate accuracy
    accuracy = np.mean(predictions == labels)

    return {
        "accuracy": accuracy,
    }

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train(resume_from_checkpoint=adapter_path)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
150,0.6201,0.48868
200,0.4842,0.451137
250,0.4592,0.44199
300,0.4063,0.441858


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
