# Installing dependencies

In [8]:
%%capture
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [9]:
%%capture
!pip install --no-deps xformers trl peft accelerate bitsandbytes


In [10]:
%%capture
!pip install openvino-dev
!pip install datasets transformers optimum[graphcore]
!pip install optimum-intel
!pip install nncf
!pip install -q "openvino>=2023.1.0" onnx

# Load secrets

In [6]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
read_key = user_secrets.get_secret("huggingface") 
# wandb_key = user_secrets.get_secret("wandb")

# Login to huggingface

In [7]:
from huggingface_hub import login
login(token= read_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# import wandb

# # start a new wandb run to track this script
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="LLM finetunes",

#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 2e-4,
#     "dataset": "JefiRyan/mental-health-conversations",
#     "steps": 100,
#     }
# )

# Adding PEFT

In [11]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)


# Loading model and tokenizer

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=read_key)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", config = bnb_config, token=read_key)
# model = OVModelForCausalLM.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

# Downloading, tokenizing and batching dataset

In [13]:
alpaca_prompt = """
You are a therapist and your name is AI-therapist. Your goal is to provide mental health support and counseling to users. Ensure that your responses are empathetic, supportive, and non-judgmental. Prioritize the user’s well-being and safety at all times.
Write a response that is appropriate for the input.

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs       = examples["Human"]
    outputs      = examples["Agent"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("JefiRyan/mental-health-conversations")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34836 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8710 [00:00<?, ? examples/s]

Map:   0%|          | 0/34836 [00:00<?, ? examples/s]

Map:   0%|          | 0/8710 [00:00<?, ? examples/s]

# Setting up the trainer

In [None]:
import transformers
from trl import SFTTrainer

wandb_run_name = ""

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"],
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    packing = False, 
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=4000,
#         num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir=wandb_run_name,
        optim="paged_adamw_8bit"
    ),
    peft_config = lora_config
)
trainer.train()


2024-06-11 05:48:40.088251: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 05:48:40.088356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 05:48:40.256210: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/34836 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8710 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.8337
2,3.2817
3,3.0345
4,2.7083
5,2.2094
6,1.7452
7,1.6281
8,1.8529
9,2.2618
10,1.3769




# Save fine-tuned model

In [None]:
new_model = "gemma-2b-it-ft" #Name of the model you will be pushing to huggingface model hub
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

# Note:
### You can't / sometimes unable to run cells after this because of memory issue (even cleaning cache won't always work). So now restart the session but don't forget to set persistence to files only

# Installing dependencies

In [None]:
%%capture
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
%%capture
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
%%capture
!pip install openvino-dev
!pip install datasets transformers optimum[graphcore]
!pip install optimum-intel
!pip install nncf
!pip install -q "openvino>=2023.1.0" onnx

# Imports

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

In [6]:
from peft import LoraConfig, PeftModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Huggingface details

In [None]:
user_id = ""
repo_name = ""

# Merge fine-tuned model with base model

In [8]:
# Merge the model with LoRA weights
model_id = "google/gemma-2b-it"
new_model = "gemma-2b-it-ft"
merge_model_name = "merged_model"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(new_model)

merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(merge_model_name,safe_serialization=True)
tokenizer.save_pretrained(merge_model_name)

write_key = user_secrets.get_secret("hugging_write")

dir_path = f"{user_id}/{repo_name}"
merged_model.push_to_hub(dir_path, token = write_key) # Online saving
tokenizer.push_to_hub(dir_path, token = write_key) # Online saving

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

# Converting merged model to openvino model

In [9]:
from optimum.intel.openvino import OVModelForCausalLM

2024-06-11 05:25:42.150887: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 05:25:42.151008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 05:25:42.418567: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading and exporting model at same time with `export = True`

In [10]:
ov_model = OVModelForCausalLM.from_pretrained("merged_model", export = True)

Framework not specified. Using pt to export the model.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Can't determine type of OV quantization config. Please specify explicitly whether you intend to run weight-only quantization or not with `weight_only` parameter. Creating an instance of OVWeightQuantizationConfig.
The model weights will be quantized to int8.
Using framework PyTorch: 2.1.2
Overriding 1 configuration item(s)
	- use_cache -> True
  if sequence_length != 1:
  op1 = operator(*args, **kwargs)


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│              8 │ 100% (127 / 127)            │ 100% (127 / 127)                       │
┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Output()

Compiling the model to CPU ...


# Inference with openvino model

In [11]:
tok = AutoTokenizer.from_pretrained(merge_model_name)

In [12]:
alpaca_prompt = """
You are a therapist and your name is AI-therapist. Your goal is to provide mental health support and counseling to users. Ensure that your responses are empathetic, supportive, and non-judgmental. Prioritize the user’s well-being and safety at all times.
Write a response that is appropriate for the input.

### Input:
{}

### Response:
{}"""

### Inference with streaming output

In [15]:
from transformers import TextStreamer

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "I'm very lonely", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt")


text_streamer = TextStreamer(tok)
_ = ov_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)