In [1]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# # install flash-attn
# !pip install ninja packaging
# !MAX_JOBS=12 pip install flash-attn --no-build-isolation --upgrade

In [2]:
from huggingface_hub import login

login(
  token="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", # ADD YOUR TOKEN HERE
  add_to_git_credential=False
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/me/.cache/huggingface/token
Login successful


In [3]:
import pandas as pd
from datasets import Dataset


template = """Instruction:\nBelow, the `Original Text` passage has been rewritten/transformed/improved into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt/instruction. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt or instruction that was likely given to the LLM to rewrite/transform/improve the text in this way.\n\nOriginal Text:\n{original_text}\n\nRewriten Text:\n{rewritten_text}\n\nResponse:\n{rewrite_prompt}"""

# def format_dataset(example):
#     formatted_input = template.format(
#         original_text=example['original_text'],
#         rewritten_text=example['rewritten_text'],
#         rewrite_prompt=example['rewrite_prompt']  # This will not be included in the model input
#     )
#     # Note: 'rewrite_prompt' is not directly used in formatted_input since it's the target
#     # If your model setup requires, you might separate 'formatted_input' and 'rewrite_prompt' here
#     return {
#         "input_text": formatted_input,  # This now includes your entire template with placeholders filled
#         "target_text": example['rewrite_prompt']  # Target for prediction
#     }

# Step 2: Load your CSV file into a DataFrame
df = pd.read_csv('merged_dataset.csv')

In [4]:
df["prompt"] = df.apply(lambda row: template.format(original_text=row.original_text,
                                                             rewritten_text=row.rewritten_text,
                                                             rewrite_prompt=row.rewrite_prompt), axis=1)
data = df.prompt.tolist()

In [9]:
df['prompt']

0        Instruction:\nBelow, the `Original Text` passa...
1        Instruction:\nBelow, the `Original Text` passa...
2        Instruction:\nBelow, the `Original Text` passa...
3        Instruction:\nBelow, the `Original Text` passa...
4        Instruction:\nBelow, the `Original Text` passa...
                               ...                        
18337    Instruction:\nBelow, the `Original Text` passa...
18338    Instruction:\nBelow, the `Original Text` passa...
18339    Instruction:\nBelow, the `Original Text` passa...
18340    Instruction:\nBelow, the `Original Text` passa...
18341    Instruction:\nBelow, the `Original Text` passa...
Name: prompt, Length: 18342, dtype: object

In [11]:
# Step 3: Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)

train_test_split = dataset.train_test_split(test_size=0.2)
dataset_dict = {"train": train_test_split['train'], "test": train_test_split['test']}

In [12]:
dataset

Dataset({
    features: ['original_text', 'rewrite_prompt', 'rewritten_text', 'prompt'],
    num_rows: 18342
})

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id
model_name = "google/gemma-7b-it"
# tokenizer_id = "philschmid/gemma-tokenizer-chatml"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=8,
        lora_dropout=0.05,
        r=6,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM", 
)

In [15]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="gemma-7b-finetuned", # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=2,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    report_to="none",                       # do not report to huggingface hub
)

In [17]:
from trl import SFTTrainer

max_seq_length = 512 

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_text_field = 'prompt',
    dataset_kwargs={
        "add_special_tokens": False, # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

Generating train split: 0 examples [00:00, ? examples/s]



In [18]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmehrdad-jahanbanifard[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/11352 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


KeyboardInterrupt: 