# Phi 3.5 mini instruct

In [None]:
%%capture
# The %%capture magic above must stay on the first line of the cell.
# Install the released Unsloth package from PyPI.
!pip install unsloth
# Then swap it for the latest nightly build from GitHub. The URL carries no tag
# or commit, so re-running this cell later may install different code.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
#Install the required packages for this project
!pip install transformers datasets bitsandbytes accelerate peft
!pip install scikit-learn
!pip install torch --upgrade
!pip install evaluate
!pip install flash-attn
!pip install wandb
!pip install logging
!pip install huggingface-hub

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826
Successfully built 

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, TrainerCallback
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import hashlib
import random
import evaluate
import numpy as np
from huggingface_hub import notebook_login
import time
import math
import warnings
import wandb
import logging
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint")
from torch.utils.data import DataLoader

In [None]:
import os

from huggingface_hub import HfApi, login, notebook_login

# Never hardcode an access token in the notebook: anything committed here is
# public. The token that used to be embedded in this cell must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
#
# Authenticate exactly once: use HF_TOKEN from the environment when it is set,
# otherwise fall back to the interactive login widget.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

# Verify login
api = HfApi()

try:
    user_info = api.whoami()
    # Print only the account name; the full whoami() payload also contains the
    # account email and token metadata, which would end up in the saved output.
    print("Successfully authenticated! User:", user_info.get("name"))
except Exception as e:
    print("Authentication error:", e)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Successfully authenticated! User info: {'type': 'user', 'id': '66c85ffe7d9956c70e4e6b5d', 'name': 'ayshwaryaninet1', 'fullname': 'Shyam', 'email': 'ayshwaryaninet1@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/0591821f79c79ff8d50e97d4ac449afc.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'fine_tuning', 'role': 'write', 'createdAt': '2024-10-13T01:09:58.195Z'}}}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Data loading and preprocessing functions
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the ``.jsonl`` file; each non-blank line must hold one
            JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding, and skip blank lines (e.g. a trailing newline at the end
    # of the file) instead of failing on them.
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

def format_unsloth_data(data):
    """Convert raw records into prompt/response training examples.

    Each record's ``'text'`` is expected to contain a ``### Query:`` section
    followed by a ``### Response:`` section. The query and response are
    extracted, stripped, and wrapped in ``<|user|>`` / ``<|assistant|>`` tags.

    Args:
        data: Iterable of dicts, each with a ``'text'`` string.

    Returns:
        A list of dicts with keys ``'prompt'`` (the query), ``'prompt_id'``
        (SHA-256 hex digest of the query) and ``'content'`` (the tagged text).

    Raises:
        ValueError: If a record lacks the query marker, or has no response
            marker after it. Previously such records were sliced from a bogus
            offset and silently produced corrupted examples.
    """
    query_marker = "### Query:"
    response_marker = "### Response:"

    formatted_data = []
    for index, item in enumerate(data):
        text = item['text']

        query_pos = text.find(query_marker)
        if query_pos == -1:
            raise ValueError(f"Record {index} has no '{query_marker}' marker.")
        query_start = query_pos + len(query_marker)

        # Only a response marker that follows the query is meaningful.
        response_pos = text.find(response_marker, query_start)
        if response_pos == -1:
            raise ValueError(
                f"Record {index} has no '{response_marker}' marker after the query."
            )

        query = text[query_start:response_pos].strip()
        response = text[response_pos + len(response_marker):].strip()

        # Stable identifier derived from the query text alone.
        prompt_id = hashlib.sha256(query.encode()).hexdigest()

        formatted_data.append({
            "prompt": query,
            "prompt_id": prompt_id,
            "content": f"<|user|>{query}<|end|><|assistant|>{response}<|end|>"
        })
    return formatted_data

def collate_and_tokenize(examples, tokenizer, max_length):
    """Tokenize a batch of formatted examples for causal-LM fine-tuning.

    Args:
        examples: Batch mapping whose ``'data'`` entry is a list of dicts, each
            with a ``'content'`` string.
        tokenizer: Callable tokenizer; invoked with the texts plus
            ``padding="max_length"``, ``truncation=True``, ``max_length`` and
            ``return_tensors="pt"``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output with an added ``'labels'`` entry: a copy of
        ``input_ids`` in which padding positions are set to -100.
    """
    texts = [example['content'] for example in examples['data']]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    labels = encoded['input_ids'].clone()
    # Every sequence is padded to max_length, so without masking most label
    # positions would be pad tokens. -100 is the index the cross-entropy loss
    # ignores; the attention mask is used (rather than the pad token id) so a
    # real token that shares the pad id is never masked by mistake.
    if 'attention_mask' in encoded:
        labels[encoded['attention_mask'] == 0] = -100

    encoded['labels'] = labels
    return encoded


In [None]:
def prepare_datasets(data_path, tokenizer, max_length=1024, test_size=0.3, random_state=42):
    """Load a JSONL file, split it, and return tokenized train/test datasets.

    Args:
        data_path: Path of the JSONL file to load.
        tokenizer: Tokenizer forwarded to ``collate_and_tokenize``.
        max_length: Sequence length used for padding/truncation.
        test_size: Fraction of records held out for the test split. The default
            of 0.3 gives a 70/30 split (the old comment claiming 90/10 was wrong).
        random_state: Seed for the train/test shuffle, for reproducibility.

    Returns:
        ``(tokenized_train, tokenized_test)``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If the file yields no records.
    """
    try:
        data = load_jsonl(data_path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The file {data_path} was not found. Please check the file path and try again.") from err

    if not data:
        raise ValueError(f"The file {data_path} is empty or could not be read properly.")

    # Hold out `test_size` of the records (30% by default, i.e. a 70/30 split).
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)

    train_data_formatted = format_unsloth_data(train_data)
    test_data_formatted = format_unsloth_data(test_data)

    train_dataset = Dataset.from_dict({"data": train_data_formatted})
    test_dataset = Dataset.from_dict({"data": test_data_formatted})

    print(f"Dataset size - Train: {len(train_dataset)}, Test: {len(test_dataset)}")

    def tokenize_split(dataset):
        # Replace the raw "data" column with the tokenizer's output columns.
        return dataset.map(
            lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
            batched=True,
            remove_columns=dataset.column_names
        )

    return tokenize_split(train_dataset), tokenize_split(test_dataset)


In [None]:
# Load the 4-bit Phi-3.5-mini-instruct base model through Unsloth and wrap it
# with LoRA adapters.

# Set HF_HOME
# NOTE(review): transformers/huggingface_hub were already imported in an earlier
# cell; presumably HF_HOME has to be set before those imports to change the
# cache location — confirm this assignment has any effect here.
os.environ['HF_HOME'] = 'REDACTED'

from unsloth import FastLanguageModel
import torch
# NOTE(review): the datasets are tokenized with max_length=1024 in a later cell,
# while the model is configured for 2048 here — confirm the mismatch is intended.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

# Returns the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Rebind `model` to the LoRA-wrapped version; the adapters target the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Print initial trainable parameters
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters and
    over those with ``requires_grad`` set, and prints both totals together with
    the trainable share as a percentage (two decimals).
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share:.2f}")

# Report how many parameters of the current `model` have requires_grad set.
print_trainable_parameters(model)


trainable params: 29884416 || all params: 2039024640 || trainable%: 1.47


In [None]:
# Prepare datasets
# This cell is Colab-specific: the training file is read from a mounted Google
# Drive at a hardcoded path.
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Every example is padded or truncated to 1024 tokens (max_length below).
train_dataset, test_dataset = prepare_datasets("/content/drive/My Drive/combined_UnitOps_Training_ZAR (1).jsonl", tokenizer, max_length=1024)

Mounted at /content/drive
Dataset size - Train: 4370, Test: 1873


Map:   0%|          | 0/4370 [00:00<?, ? examples/s]

Map:   0%|          | 0/1873 [00:00<?, ? examples/s]

In [None]:
# Initialize wandb
# Starts a Weights & Biases run in the "Capstone" project under the given
# entity. When no API key is stored yet, this prompts for one interactively.
wandb.init(project="Capstone", entity="23648727-the-university-of-western-australia")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Define a function to compute metrics like accuracy and perplexity
def compute_metrics(eval_pred):
    """Compute next-token accuracy and perplexity for a causal language model.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has shape
            ``(..., seq_len, vocab_size)`` (or is a tuple whose first item does)
            and ``labels`` has shape ``(..., seq_len)``. Label positions equal
            to -100 are ignored.

    Returns:
        ``{"accuracy": float, "perplexity": float}``. Accuracy is the share of
        non-ignored target tokens predicted correctly; perplexity is
        ``exp(mean cross-entropy)`` over the same tokens, or ``inf`` when the
        loss is 300 or more or when no token is scored.

    Note:
        This needs the full-vocabulary logits of the whole eval set in memory,
        which is very large for long sequences — evaluate on a small subset if
        this function is passed to a trainer.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits; keep the logits.
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels)

    # Causal LM: the logits at position t predict the token at position t + 1.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    scored = shift_labels != -100
    if int(scored.sum()) == 0:
        return {"accuracy": 0.0, "perplexity": float('inf')}

    predictions = np.argmax(shift_logits, axis=-1)
    accuracy = float((predictions == shift_labels)[scored].mean())

    # Token-level cross-entropy via a numerically stable log-sum-exp.
    peak = shift_logits.max(axis=-1, keepdims=True)
    log_norm = peak[..., 0] + np.log(np.exp(shift_logits - peak).sum(axis=-1))
    # Ignored positions get a dummy index 0 so the gather below stays in range;
    # they are dropped again by the `scored` mask.
    safe_labels = np.where(scored, shift_labels, 0).astype(np.int64)
    target_logits = np.take_along_axis(shift_logits, safe_labels[..., None], axis=-1)[..., 0]
    eval_loss = float((log_norm - target_logits)[scored].mean())

    perplexity = math.exp(eval_loss) if eval_loss < 300 else float('inf')
    return {"accuracy": accuracy, "perplexity": perplexity}

# Release cached GPU memory before the trainer is constructed.
torch.cuda.empty_cache()

# Initialize the SFTTrainer
# NOTE(review): compute_metrics is defined above but is not passed to the
# trainer, so evaluation reports the loss only.
# NOTE(review): the datasets built by prepare_datasets are already tokenized and
# have had their original columns removed, so there is no "text" column;
# presumably SFTTrainer skips its own tokenization for such datasets — confirm
# that dataset_text_field="text" is harmless here.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,      # Validation dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # Effective batch size = 2 per device x 4 accumulation steps = 8.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # A fixed step budget; this overrides num_train_epochs.
        max_steps=250,
        learning_rate=2e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",  # Evaluate during training
        eval_steps=10,                # Evaluate every 10 steps
        save_steps=10,                # Save every 10 steps
    ),
)

# Train the model
import gc

# Manually clear GPU cache before training. Collect Python garbage first so the
# memory of dead tensors can actually be handed back by empty_cache().
gc.collect()
torch.cuda.empty_cache()

trainer.train()

# Evaluate the model
gc.collect()
torch.cuda.empty_cache()

# Keep the metrics returned by evaluate(). The earlier version stored the return
# value of torch.cuda.empty_cache() (always None) and discarded the real
# metrics, so this cell printed "Evaluation metrics: None".
eval_metrics = trainer.evaluate()
print(f"Evaluation metrics: {eval_metrics}")

# Finish wandb
wandb.finish()

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,370 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 250
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss,Validation Loss
10,1.1413,1.157136
20,1.1366,1.123016
30,1.2494,1.105886
40,1.1031,1.093922
50,1.1264,1.084736
60,1.0365,1.080619
70,1.0157,1.080522
80,1.153,1.074756
90,1.0312,1.070019
100,0.9456,1.066452


Evaluation metrics: None


VBox(children=(Label(value='0.028 MB of 0.028 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
eval/runtime,▇▆▃▂▁▁▅▃▄▄▃▂▄▅▅▄▃▄▅▅▅▆▅▅▆█
eval/samples_per_second,▂▃▆▇██▄▆▅▅▆▆▅▄▅▅▆▅▅▄▄▃▄▄▃▁
eval/steps_per_second,▁▃▆▆██▃▆▅▅▆▆▅▃▅▅▆▅▅▃▃▃▃▅▃▁
train/epoch,▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇███
train/global_step,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
train/grad_norm,▃█▃▃▂▄▄▄▂▂▁▂▁▂▁▂▁▂▁▁▁▁▂▂▂▁▁▂▁▁▂▂▁▁▁▁▂▂▁▁
train/learning_rate,▅██▇▄█▅▄▂▅▄███▇▇▇▇▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁
train/loss,█▆▅▅▁▂▃▁▃▂▃▂▄▅▄▁▃▂▂▂▂▂▃▂▂▃▂▂▂▁▃▂▁▂▄▂▁▂▁▂

0,1
eval/loss,1.0415
eval/runtime,109.07
eval/samples_per_second,17.172
eval/steps_per_second,2.155
total_flos,4.611026976768e+16
train/epoch,0.45767
train/global_step,250.0
train/grad_norm,0.32284
train/learning_rate,0.0
train/loss,1.0506


In [None]:
!pip install huggingface_hub

from transformers import AutoTokenizer
from huggingface_hub import HfFolder
import logging

# Enable logging to help debug any potential issues
logging.basicConfig(level=logging.DEBUG)

# Step 1: Authenticate with Hugging Face API
# Replace "YOUR_HUGGING_FACE_TOKEN" with your Hugging Face token
api_token = #insertyourkey
HfFolder.save_token(api_token)

# Step 2: Push Model and Trainer to Hugging Face Hub
# Assuming that `model` and `trainer` have already been defined and trained

# Push the model to Hugging Face Hub
model.push_to_hub("ayshwaryaninet1/lora_model_10")

# Push the trainer to Hugging Face Hub
trainer.push_to_hub("ayshwaryaninet1/lora_model_10")

# Step 3: Load and Save Tokenizer
# Replace the path if your tokenizer has a different base path, such as "unsloth/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained("unsloth/Phi-3.5-mini-instruct", use_auth_token=True)
tokenizer.save_pretrained("ayshwaryaninet1/lora_model_10")






No files have been modified since last commit. Skipping to prevent empty commit.


Saved model to https://huggingface.co/ayshwaryaninet1/lora_model_1


No files have been modified since last commit. Skipping to prevent empty commit.


('ayshwaryaninet1/lora_model_1/tokenizer_config.json',
 'ayshwaryaninet1/lora_model_1/special_tokens_map.json',
 'ayshwaryaninet1/lora_model_1/tokenizer.model',
 'ayshwaryaninet1/lora_model_1/added_tokens.json',
 'ayshwaryaninet1/lora_model_1/tokenizer.json')