In [None]:
# Install the training stack into the *kernel's* environment (%pip, not !pip,
# which may target a different interpreter than the one running this notebook).
#
# The Hugging Face libraries are pinned on purpose: this notebook is written
# against the TRL 0.8-era API (DataCollatorForCompletionOnlyLM, and
# SFTTrainer(max_seq_length=..., packing=...)) together with
# TrainingArguments(evaluation_strategy=...). Later releases renamed or removed
# these, so an unpinned `-U` install breaks the notebook on a fresh run.
# NOTE(review): versions below are a mutually compatible set from that API
# generation — adjust them together, not individually.
%pip install bitsandbytes==0.43.1 peft==0.10.0 transformers==4.40.2 trl==0.8.6
# Utility packages whose API this notebook does not depend on version-wise.
%pip install -U scikit-learn sentencepiece protobuf

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [None]:
# Experiment identity and file locations.
exp_name = "gemma"  # short tag; used to derive the adapter directory name
data_path = "25k_utext_uprompt.csv"  # CSV read with pd.read_csv
model_path = "google/gemma-1.1-7b-it"  # Hub id of the base checkpoint
output_path = "outputs"  # passed to TrainingArguments(output_dir=...)
model_save_path = f"{exp_name}_adapter"  # target of trainer.save_model / tokenizer.save_pretrained

In [None]:
# Training hyperparameters.
epochs = 3
batch_size = 1  # per-device batch size for both train and eval
max_seq_length = 1536  # examples that tokenize longer than this are dropped
lr = 2e-4

# Hub credential. Secrets must never be committed in a notebook: read the token
# from the HF_TOKEN environment variable instead. When it is unset this is
# None, and the Hugging Face libraries fall back to the locally cached login
# (e.g. from `huggingface-cli login`).
# SECURITY: a literal token was previously embedded here; treat it as leaked
# and revoke it in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

In [None]:
# Read the whole CSV, then hold out 10% of the rows for validation. The fixed
# random_state keeps the split identical across runs. Each split gets a fresh
# 0..n-1 index (the shuffled original index is discarded).
df = pd.read_csv(data_path)
train_df, val_df = (
    split.reset_index(drop=True)
    for split in train_test_split(df, test_size=0.1, random_state=42)
)

In [None]:
# Wrap the two pandas splits as `datasets.Dataset` objects (train first, then
# validation) so they can be handed to the trainer.
train_ds, val_ds = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)

# Fall back to EOS-as-padding only when the checkpoint ships no pad token.
# Unconditionally overwriting an existing pad token makes pad and EOS share one
# id; language-modeling collators mask every label equal to pad_token_id, so
# genuine EOS tokens would be excluded from the loss as well, and the altered
# pad token would be written out again by tokenizer.save_pretrained.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# bitsandbytes quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    # Store weights as 4-bit NF4, without the second (double) quantization pass.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    # dtype used for computation on the de-quantized weights.
    bnb_4bit_compute_dtype="float16",
)

In [None]:
# Load the base causal LM, quantized according to `bnb_config`, and let
# `device_map="auto"` decide where the weights are placed.
#
# `trust_remote_code` is deliberately NOT enabled: the checkpoint configured in
# `model_path` uses an architecture implemented inside transformers itself, so
# the flag adds nothing and would only permit execution of arbitrary Python
# shipped in the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    token=access_token,
)
# Dump the module tree (handy for checking the names used as LoRA targets).
print(model)

In [None]:
# Explicitly set the `gradient_checkpointing` attribute on the model's config
# object to False.
# NOTE(review): nothing visible in this notebook reads this config attribute,
# so its effect on training cannot be confirmed from here — verify it does what
# was intended (disabling the KV cache via `use_cache` is a different setting).
model.config.gradient_checkpointing = False

In [None]:
def token_len(text):
    """Return how many tokens `text` encodes to with the notebook's tokenizer.

    The count is taken from the length of the encoded ``input_ids`` (using the
    tokenizer's default call settings), rather than from the optional
    ``length`` field: that field is a list for some tokenizer implementations
    and a plain int for others, so indexing it with ``[0]`` is not portable.

    Args:
        text: A single string to tokenize.

    Returns:
        int: Number of token ids produced for `text`.
    """
    return len(tokenizer(text)["input_ids"])

In [None]:
prompt = """
<bos><start_of_turn>user
Given are two texts, the Rewritten Text was rewritten from the Original Text by using large language model and a sentence of prompt. You are trying to understand how the Original Text was transformed into the Rewritten Text.
Original Text: {}
Rewritten Text: {}
You should analyze the changes in style, tone, structure, content, etc. Come up with a prompt that must have been used to guide the transformation from the Original Text to the Rewritten Text. Now return the prompt ONLY in one sentence.<end_of_turn>
<start_of_turn>model
Prompt: {}
"""

In [None]:
def formatting_prompts_func(example):
    """Render a batch of rows into training strings, dropping over-long ones.

    Args:
        example: Column-oriented batch; must provide equally indexed sequences
            under the keys "original_text", "rewritten_text" and
            "rewrite_prompt".

    Returns:
        list[str]: One filled-in `prompt` per row, in input order, omitting any
        row whose rendered text has more than `max_seq_length` tokens
        according to `token_len`. The result may therefore be shorter than the
        input batch.
    """
    row_count = len(example["rewritten_text"])
    # Lazily render each row so formatting and length-checking stay interleaved
    # row by row.
    rendered = (
        prompt.format(
            example["original_text"][row],
            example["rewritten_text"][row],
            example["rewrite_prompt"][row],
        )
        for row in range(row_count)
    )
    return [text for text in rendered if not (token_len(text) > max_seq_length)]

In [None]:
# Marker text that precedes the target in every rendered example (the template
# ends with "Prompt: {}"); it is handed to the completion-only collator as the
# response template.
# NOTE(review): the collator matches the template by token ids — confirm that
# "Prompt:" tokenizes the same way standalone as it does inside the template.
response_template = "Prompt:"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
)

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter capacity / scaling / regularisation.
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Modules that receive adapters: the four attention projections plus the
    # MLP gate projection.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

In [None]:
# Trainer settings, grouped by concern.
args = TrainingArguments(
    # --- checkpoints ---------------------------------------------------
    output_dir=output_path,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=5,
    # --- optimisation --------------------------------------------------
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,  # gradients accumulated over 8 steps per update
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.001,
    optim="paged_adamw_8bit",
    fp16=True,
    # --- evaluation & logging -----------------------------------------
    # Evaluate on the same 200-step cadence as saving, so each saved
    # checkpoint has a matching eval_loss.
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
    report_to="none",
    # --- model selection ----------------------------------------------
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Assemble the supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    args=args,
    # Hand over the notebook's own tokenizer. Without it the trainer loads a
    # separate tokenizer from the model's name/path, so the instance used to
    # tokenize the training data would not be the one used by `collator` and
    # by the length filter in `formatting_prompts_func`.
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # Each batch of rows is rendered to text by the formatting function; rows
    # are kept as individual sequences (no packing) so the completion-only
    # collator can locate the response marker in every example.
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    packing=False,
    max_seq_length=max_seq_length,
    # LoRA adapters are attached by the trainer from this config.
    peft_config=peft_config,
)

In [None]:
# Ask PyTorch's CUDA allocator to release cached memory it is not currently
# using, before the training run begins.
torch.cuda.empty_cache()
# Run fine-tuning with the trainer built above. As the cell's last expression,
# the call's return value is displayed as the cell output.
trainer.train()

In [None]:
# Write the trained model to `model_save_path` through the trainer.
# NOTE(review): because the trainer was built with a `peft_config`, this
# presumably stores only the LoRA adapter weights rather than the full base
# model — confirm before relying on the directory for standalone loading.
trainer.save_model(model_save_path)
# Store the tokenizer files in the same directory so the adapter and the
# tokenizer it was trained with travel together.
tokenizer.save_pretrained(model_save_path)