Install and Load libraries

In [1]:
#Install the required packages for this project
!pip install transformers datasets bitsandbytes accelerate peft
!pip install scikit-learn
!pip install torch --upgrade
!pip install evaluate
!pip install flash-attn
!pip install wandb
!pip install logging
!pip install huggingface-hub

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.0 MB/s[0m et

In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, TrainerCallback
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import hashlib
import random
import evaluate
import numpy as np
from huggingface_hub import notebook_login
import time
import math
import warnings
import wandb
import logging
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint")
from torch.utils.data import DataLoader

In [3]:
from huggingface_hub import login

# Replace 'your_access_token_here' with your actual access token
login(token=#insertyourkey)

# Verify login
from huggingface_hub import HfApi

api = HfApi()

try:
    user_info = api.whoami()
    print("Successfully authenticated! User info:", user_info)
except Exception as e:
    print("Authentication error:", e)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Successfully authenticated! User info: {'type': 'user', 'id': '66b61ffb589b4e3f6e045c15', 'name': 'ShilpaSandhya', 'fullname': 'Shilpa Sandhya', 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/c676934935390ea9d7a0903a62fc794c.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'phi-3Model', 'role': 'fineGrained', 'createdAt': '2024-09-17T00:24:02.570Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['inference.serverless.write', 'discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '66b61ffb589b4e3f6e045c15', 'type': 'user', 'name': 'ShilpaSandhya'}, 'permissions': ['repo.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Data loading and preprocessing functions
def load_jsonl(path):
    with open(path, 'r') as file:
        return [json.loads(line) for line in file]

def format_ultrachat_data(data):
    formatted_data = []
    for item in data:
        text = item['text']
        query_start = text.find("### Query:") + len("### Query:")
        response_start = text.find("### Response:") + len("### Response:")
        references_start = text.find("### References:") + len("### References:")

        query = text[query_start:response_start - len("### Response:")].strip()
        response = text[response_start:references_start - len("### References:")].strip()

        prompt_id = hashlib.sha256(query.encode()).hexdigest()

        formatted_item = {
            "prompt": query,
            "prompt_id": prompt_id,
            "messages": [
                {"content": query, "role": "user"},
                {"content": response, "role": "assistant"}
            ]
        }
        formatted_data.append(formatted_item)
    return formatted_data

def collate_and_tokenize(examples, tokenizer, max_length):
    texts = [" ".join([msg['content'] for msg in example['messages']]) for example in examples['data']]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    encoded['labels'] = encoded['input_ids'].clone()
    return encoded

def prepare_datasets(data_path, tokenizer, max_length=2048):
    try:
        data = load_jsonl(data_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"The file {data_path} was not found. Please check the file path and try again.")

    if not data:
        raise ValueError(f"The file {data_path} is empty or could not be read properly.")

    # Use 70-30 split
    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

    train_data_formatted = format_ultrachat_data(train_data)
    test_data_formatted = format_ultrachat_data(test_data)

    train_dataset = Dataset.from_dict({"data": train_data_formatted})
    test_dataset = Dataset.from_dict({"data": test_data_formatted})

    print(f"Dataset size - Train: {len(train_dataset)}, Test: {len(test_dataset)}")

    # Tokenize datasets
    tokenized_train = train_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_test = test_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=test_dataset.column_names
    )

    return tokenized_train, tokenized_test

Load base model

In [5]:
# Set HF_HOME
os.environ['HF_HOME'] = 'REDACTED'

model_name = "microsoft/Phi-3.5-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
# Define LoRA Config
target_modules = []
for i in range(10):  # Phi-3.5-mini has 10 layers
    target_modules.extend([
        f'model.layers.{i}.self_attn.o_proj',
        f'model.layers.{i}.self_attn.qkv_proj',
        f'model.layers.{i}.mlp.gate_up_proj',
        f'model.layers.{i}.mlp.down_proj',
    ])

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get PEFT model
lora_model = get_peft_model(model, config)

# Print initial trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(lora_model)

# Prepare datasets
train_dataset, test_dataset = prepare_datasets("combined_UnitOps_Training_ZAR.jsonl", tokenizer, max_length=2048)

trainable params: 15728640 || all params: 3836808192 || trainable%: 0.41
Dataset size - Train: 4370, Test: 1873


Map:   0%|          | 0/4370 [00:00<?, ? examples/s]

Map:   0%|          | 0/1873 [00:00<?, ? examples/s]

In [9]:
import transformers
# Initialize wandb
wandb.init(project="CapstoneProject", entity="shilpasandhya-s229-university-of-western-australia")
logging.basicConfig(level=logging.INFO)

training_args = TrainingArguments(
    output_dir="./phi3_5_mini_instruct_lora_chemical_eng_flash",
    run_name=f"phi3-5-mini-instruct-lora-run-flash-{time.strftime('%Y%m%d-%H%M%S')}",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    optim="adamw_torch",
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=10,
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=True,
    fp16_full_eval=True,
    max_grad_norm=0.3,
    report_to=["wandb"],
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3
)

class DetailedLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero:
            if 'loss' in logs:
                wandb.log({"train_loss": logs['loss'], "step": state.global_step})
            if 'eval_loss' in logs:
                wandb.log({"eval_loss": logs['eval_loss'], "step": state.global_step})
                perplexity = math.exp(logs['eval_loss'])
                wandb.log({"perplexity": perplexity, "step": state.global_step})

            # Log memory usage
            memory_used = torch.cuda.memory_allocated() / 1e9  # Convert to GB
            wandb.log({"memory_used_gb": memory_used, "step": state.global_step})

def data_collator(examples):
    return tokenizer.pad(examples, padding=True, return_tensors="pt")

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    collate_fn=data_collator
)

trainer = transformers.Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback, DetailedLoggingCallback()],
)

# Disable cache to prevent warning, re-enable for inference
model.config.use_cache = False

# Efficiency metrics
start_time = time.time()
start_memory = torch.cuda.memory_allocated()
trainer.train()
end_time = time.time()
end_memory = torch.cuda.memory_allocated()

training_time = end_time - start_time
memory_used = end_memory - start_memory

# Performance evaluation
eval_results = trainer.evaluate()

print(f"Training Time: {training_time:.2f} seconds")
print(f"Memory Used: {memory_used / 1e9:.2f} GB")
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Add this after training
wandb.finish()
# WANDB Key: 36d1a400cae0bcea6edccb8d15883f422b90a00e

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,0.0877,0.087381
200,0.0785,0.081177
300,0.0763,0.079947
400,0.0782,0.079291
500,0.0764,0.079466
600,0.0683,0.079829
700,0.0717,0.080465


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training Time: 10142.73 seconds
Memory Used: 7.85 GB
Perplexity: 1.08


VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▃▂▁▁▁▂▁
eval/runtime,▆██▇█▇▆▁
eval/samples_per_second,▃▁▁▂▁▂▃█
eval/steps_per_second,▃▁▁▂▁▂▃█
eval_loss,█▃▂▁▁▁▂▁
memory_used_gb,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perplexity,█▃▂▁▁▁▂▁
step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇███
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇██

0,1
eval/loss,0.07929
eval/runtime,215.7865
eval/samples_per_second,8.68
eval/steps_per_second,2.173
eval_loss,0.07929
memory_used_gb,7.81796
perplexity,1.08252
step,700.0
total_flos,1.0285142852291788e+18
train/epoch,5.12351


In [10]:
lora_model.push_to_hub("ShilpaSandhya/phi3_5_mini_lora_chemical_eng_flash")
trainer.push_to_hub("ShilpaSandhya/phi3_5_mini_lora_chemical_eng_flash")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ShilpaSandhya/phi3_5_mini_instruct_lora_chemical_eng_flash/commit/f5d77b35ddd4a5bb174af37ddec9aa05a4731b4a', commit_message='ShilpaSandhya/phi3_5_mini_lora_chemical_eng_flash', commit_description='', oid='f5d77b35ddd4a5bb174af37ddec9aa05a4731b4a', pr_url=None, pr_revision=None, pr_num=None)

In [12]:

# Example of generating text with the fine-tuned model
input_text = "What is ideal gas law?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(lora_model.device)
with torch.no_grad():
    outputs = model.generate(input_ids, max_new_tokens=200, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

What is ideal gas law? What is the significance of the critical point in the context of phase diagrams and thermodynamics? The critical point on a phase diagram represents the highest temperature and pressure at which a substance can exist as a liquid and a vapor in equilibrium. Beyond this point, the substance exists as a supercritical fluid, where it exhibits properties of both liquids and gases. This concept is crucial in thermodynamics and phase diagrams because it defines the boundary between distinct phases and helps in understanding the behavior of substances under various conditions. It's particularly important in processes like supercritical fluid extraction, where the unique properties of supercritical fluids are exploited for efficient separation and extraction processes.
