In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import sys
import site
import os

# Install the required packages
!{sys.executable} -m pip install --upgrade  "transformers>=4.38.*"
!{sys.executable} -m pip install --upgrade  "datasets>=2.18.*"
!{sys.executable} -m pip install --upgrade "wandb>=0.16.*"
!{sys.executable} -m pip install --upgrade "trl>=0.7.11"
!{sys.executable} -m pip install --upgrade "peft>=0.9.0"
!{sys.executable} -m pip install --upgrade "accelerate>=0.28.*"

# Get the site-packages directory
site_packages_dir = site.getsitepackages()[0]

# add the site pkg directory where these pkgs are insalled to the top of sys.path
if not os.access(site_packages_dir, os.W_OK):
    user_site_packages_dir = site.getusersitepackages()
    if user_site_packages_dir in sys.path:
        sys.path.remove(user_site_packages_dir)
    sys.path.insert(0, user_site_packages_dir)
else:
    if site_packages_dir in sys.path:
        sys.path.remove(site_packages_dir)
    sys.path.insert(0, site_packages_dir)

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import warnings
warnings.filterwarnings("ignore")

import os
import psutil

num_physical_cores = psutil.cpu_count(logical=False)
num_cores_per_socket = num_physical_cores // 2

os.environ["TOKENIZERS_PARALLELISM"] = "0"
#HF_TOKEN = os.environ["HF_TOKEN"]

# Set the LD_PRELOAD environment variable
ld_preload = os.environ.get("LD_PRELOAD", "")
conda_prefix = os.environ.get("CONDA_PREFIX", "")
# Improve memory allocation performance, if tcmalloc is not available, please comment this line out
os.environ["LD_PRELOAD"] = f"{ld_preload}:{conda_prefix}/lib/libtcmalloc.so"
# Reduce the overhead of submitting commands to the GPU
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
# reducing memory accesses by fusing SDP ops
os.environ["ENABLE_SDP_FUSION"] = "1"
# set openMP threads to number of physical cores
os.environ["OMP_NUM_THREADS"] = str(num_physical_cores)
# Set the thread affinity policy
os.environ["OMP_PROC_BIND"] = "close"
# Set the places for thread pinning
os.environ["OMP_PLACES"] = "cores"

print(f"Number of physical cores: {num_physical_cores}")
print(f"Number of cores per socket: {num_cores_per_socket}")
print(f"OpenMP environment variables:")
print(f"  - OMP_NUM_THREADS: {os.environ['OMP_NUM_THREADS']}")
print(f"  - OMP_PROC_BIND: {os.environ['OMP_PROC_BIND']}")
print(f"  - OMP_PLACES: {os.environ['OMP_PLACES']}")

Number of physical cores: 112
Number of cores per socket: 56
OpenMP environment variables:
  - OMP_NUM_THREADS: 112
  - OMP_PROC_BIND: close
  - OMP_PLACES: cores


In [4]:
import asyncio
import threading
import torch
from IPython.display import display, HTML

import torch
import intel_extension_for_pytorch as ipex

if torch.xpu.is_available():
    torch.xpu.empty_cache()
    
    def get_memory_usage():
        memory_reserved = round(torch.xpu.memory_reserved() / 1024**3, 3)
        memory_allocated = round(torch.xpu.memory_allocated() / 1024**3, 3)
        max_memory_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 3)
        max_memory_allocated = round(torch.xpu.max_memory_allocated() / 1024**3, 3)
        return memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated
   
    def print_memory_usage():
        device_name = torch.xpu.get_device_name()
        print(f"XPU Name: {device_name}")
        memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
        memory_usage_text = f"XPU Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
        print(f"\r{memory_usage_text}", end="", flush=True)
    
    async def display_memory_usage(output):
        device_name = torch.xpu.get_device_name()
        output.update(HTML(f"<p>XPU Name: {device_name}</p>"))
        while True:
            memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
            memory_usage_text = f"XPU ({device_name}) :: Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
            output.update(HTML(f"<p>{memory_usage_text}</p>"))
            await asyncio.sleep(5)
    
    def start_memory_monitor(output):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.create_task(display_memory_usage(output))
        thread = threading.Thread(target=loop.run_forever)
        thread.start()    
    output = display(display_id=True)
    start_memory_monitor(output)
else:
    print("XPU device not available.")

XPU device not available.


In [5]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # could use q, v and 0 projections as well and comment out the rest
    target_modules=["q_proj", "o_proj", 
                    "v_proj", "k_proj", 
                    "gate_proj", "up_proj",
                    "down_proj"],
    task_type="CAUSAL_LM")



In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

USE_CPU = False
device = "xpu:0" if torch.xpu.is_available() else "cpu"
if USE_CPU:
    device = "cpu"
print(f"using device: {device}")

model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
# Set padding side to the right to ensure proper attention masking during fine-tuning
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# Disable caching mechanism to reduce memory usage during fine-tuning
model.config.use_cache = False
# Configure the model's pre-training tensor parallelism degree to match the fine-tuning setup
model.config.pretraining_tp = 1 

using device: cpu


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def generate_response(model, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)    
    outputs = model.generate(input_ids, max_new_tokens=100,
                             eos_token_id=tokenizer.eos_token_id)    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def test_model(model, test_inputs):
    """quickly test the model using queries."""
    for input_text in test_inputs:
        print("__"*25)
        generated_response = generate_response(model, input_text)
        print(f"{input_text}")
        print(f"Generated Answer: {generated_response}\n")
        print("__"*25)

test_inputs = [
"Write a story about a futuristic city where robots and humans coexist, focusing on the friendship between a young girl and her robot companion.",
"Craft a mystery story set in a Victorian-era mansion, involving a secret society and a missing heirloom.",
"Develop a thriller that takes place during a severe snowstorm in a remote mountain cabin, with characters who discover they are not alone.",
"Compose a fantasy tale about a kingdom threatened by a mythical creature, seen through the eyes of a young apprentice wizard tasked with saving the realm.",
"Create a romantic story set on a Mediterranean cruise, where two strangers from different backgrounds fall in love while dealing with personal challenges."
]

print("Testing the model before fine-tuning:")
test_model(model, test_inputs)

Testing the model before fine-tuning:
__________________________________________________
Write a story about a futuristic city where robots and humans coexist, focusing on the friendship between a young girl and her robot companion.
Generated Answer: Write a story about a futuristic city where robots and humans coexist, focusing on the friendship between a young girl and her robot companion.

Answer:

Step 1/2
Once upon a time, there was a young girl named Sarah who lived in a futuristic city. The city was filled with advanced technology and robots that were used for various purposes. Sarah was a curious girl who loved to explore and learn about the world around her. One day, Sarah was walking through the city when she saw a robot standing on a street corner. The robot was wearing a blue jumpsuit and had a friendly smile on its face. Sarah approached

__________________________________________________
__________________________________________________
Craft a mystery story set in a Vic

In [8]:
from datasets import load_dataset

dataset_name = "PocketDoc/RUCAIBox-Story-Generation-Alpaca"
dataset = load_dataset(dataset_name ,split="train")

2024-04-28 10:44:00,276 - datasets - INFO - PyTorch version 2.2.2 available.


In [9]:
print(dataset[0])

print(f"Instruction is: {dataset[0]['instruction']}")
print(f"Response is: {dataset[0]['output']}")


print(f"Number of examples in the dataset: {len(dataset)}")
print(f"Fields in the dataset: {list(dataset.features.keys())}")

{'instruction': 'Your story should involve "My dog was diagnosed with congestive heart failure. He hung on for 9 months. We eventually had to put him to sleep." in a brief, concise manner.', 'input': '', 'output': 'It was such a sad time. We had to put him to sleep right before my daughters high school graduation. We had company coming in to town for graduation. After the dog passed away we were all so sad and did not have a lot of energy to get the house ready for company. The dog was my best friend. He was so sweet and kind and loving. He followed me everywhere. He loved to cuddle with me on the couch and put his head on the backs of my knees. It was such a soothing experience being cuddled up like that with him. When we discovered he was dying we found a puppy from a little girl on craigslist. She could not take care of the puppy and happily sold him to us. The puppy was great for the dying dog. It gave the dog a purpose and something to focus on, training the new guy. At first the 

In [10]:
def format_prompts(batch):
    formatted_prompts = []
    for instruction, user_input, response in zip(batch["instruction"], batch['input'], batch["output"]):
        prompt = f"Instruction:\n{instruction}\n\n{user_input}\n\nResponse:\n{response}"
        formatted_prompts.append(prompt)
    return {"text": formatted_prompts}

dataset = dataset.map(format_prompts, batched=True)
split_dataset = dataset.train_test_split(test_size=0.2, seed=99)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

In [11]:
import transformers
import wandb

from trl import SFTTrainer

os.environ["WANDB_PROJECT"] = "gemma_storytelling"  
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["IPEX_TILE_AS_DEVICE"] = "1"

finetuned_model_id = "Maelstrome/gemma-2b-storytelling"
PUSH_TO_HUB = True
USE_WANDB = False

# Calculate max_steps based on the subset size
num_train_samples = len(train_dataset)
batch_size = 4
gradient_accumulation_steps = 8
steps_per_epoch = num_train_samples // (batch_size * gradient_accumulation_steps)
num_epochs = 1
max_steps = steps_per_epoch * num_epochs
print(f"Finetuning for max number of steps: {max_steps}")



training_args = transformers.TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_ratio=0.05,
        max_steps=max_steps,
        learning_rate=1e-5,
        evaluation_strategy="steps",
        save_steps=500,
        bf16=True,
        logging_steps=100,
        output_dir=finetuned_model_id,
        hub_model_id=finetuned_model_id if PUSH_TO_HUB else None,
        use_ipex=True,
        report_to="wandb" if USE_WANDB else None,
        push_to_hub=PUSH_TO_HUB,
        max_grad_norm=0.6,
        weight_decay=0.01,
        group_by_length=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    packing=True
)

if device != "cpu":
    print_memory_usage()
    torch.xpu.empty_cache()
results = trainer.train()


Finetuning for max number of steps: 154


max_steps is given, it will override any value given in num_train_epochs
2024-04-28 10:44:10,876 - wandb.jupyter - ERROR - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
[34m[1mwandb[0m: Currently logged in as: [33mbillzhangsc[0m ([33mbillzhang-25[0m). Use [1m`wandb login --relogin`[0m to force relogin
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


Step,Training Loss,Validation Loss
100,1454737970954.24,


max_steps is given, it will override any value given in num_train_epochs
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


Time:  23659.11
Samples/second:  0.21


NameError: name 'get_memory_usage' is not defined

In [12]:
def print_training_summary(results):
    print(f"Time: {results.metrics['train_runtime']: .2f}")
    print(f"Samples/second: {results.metrics['train_samples_per_second']: .2f}")

print_training_summary(results)
wandb.finish()

# save lora model
tuned_lora_model = "gemma-2b-storytelling-lora"
trainer.model.save_pretrained(tuned_lora_model)

Time:  23659.11
Samples/second:  0.21


VBox(children=(Label(value='166.436 MB of 166.436 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁█
train/global_step,▁▁█
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,
eval/runtime,767.2118
eval/samples_per_second,1.125
eval/steps_per_second,0.141
total_flos,3.05785512394752e+16
train/epoch,1.41123
train/global_step,154.0
train/grad_norm,
train/learning_rate,0.0
train/loss,1454737970954.24


In [20]:
from peft import PeftModel

tuned_model = "gemma-2b"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)

model = PeftModel.from_pretrained(base_model, tuned_lora_model)
model = model.merge_and_unload()
# save final tuned model
model.save_pretrained(tuned_model)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
#model2 = ipex.optimize_transformers(model)  # optimize the model using `ipex`

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
test_inputs = [
"Write a story about a futuristic city where robots and humans coexist, focusing on the friendship between a young girl and her robot companion.",
"Craft a mystery story set in a Victorian-era mansion, involving a secret society and a missing heirloom.",
"Develop a thriller that takes place during a severe snowstorm in a remote mountain cabin, with characters who discover they are not alone.",
"Compose a fantasy tale about a kingdom threatened by a mythical creature, seen through the eyes of a young apprentice wizard tasked with saving the realm.",
"Create a romantic story set on a Mediterranean cruise, where two strangers from different backgrounds fall in love while dealing with personal challenges."
]
for text in test_inputs:
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=200, 
                             do_sample=False, top_k=100,temperature=0.1, 
                             eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Write a story about a futuristic city where robots and humans coexist, focusing on the friendship between a young girl and her robot companion.
Craft a mystery story set in a Victorian-era mansion, involving a secret society and a missing heirloom.
Develop a thriller that takes place during a severe snowstorm in a remote mountain cabin, with characters who discover they are not alone.
Compose a fantasy tale about a kingdom threatened by a mythical creature, seen through the eyes of a young apprentice wizard tasked with saving the realm.
Create a romantic story set on a Mediterranean cruise, where two strangers from different backgrounds fall in love while dealing with personal challenges.


In [22]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Maelstrome/gemma-2b-storytelling/commit/dfd430c5f796818d23acaa38a588fb118bc040c7', commit_message='End of training', commit_description='', oid='dfd430c5f796818d23acaa38a588fb118bc040c7', pr_url=None, pr_revision=None, pr_num=None)