In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig

import torch
# Use the CUDA device when torch reports one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextStreamer
import numpy as np

import pandas as pd

from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Worked example: a source passage, the rewrite instruction, and a reference rewrite of it.
original_text = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
prompt = "Convert this into a sea shanty: "
# Adjacent string literals are joined at compile time; the pieces are kept one per stanza.
rewritten_text = (
    "Here is your shanty: "
    "(Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. "
    "(Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand."
    "(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"
)

In [3]:
# Checkpoint used to regenerate a rewrite from a known prompt.
model_name = "google/gemma-7b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map='auto' leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
# Prints decoded tokens while generate() runs; skip_prompt=True suppresses the prompt echo.
# (Created once: the original cell built an identical streamer twice.)
streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)
# Sentence encoder whose embeddings are compared by get_sim().
sentenceTF = SentenceTransformer('sentence-transformers/sentence-t5-base')

def count_tokens(text):
    """Return how many token ids the notebook-level ``tokenizer`` produces for ``text``."""
    encoded = tokenizer(text)
    token_ids = encoded["input_ids"]
    return len(token_ids)

def sim(s, k, p=3):
    """Sharpened cosine similarity between two 1-D vectors.

    Computes ``sign(s.k) * (|s.k| / (||s|| * ||k||)) ** p``. The exponent is
    applied to the whole cosine ratio, so the score is invariant to the scale
    of either vector and always lies in ``[-1, 1]``. (Raising only the dot
    product to ``p`` gives the same number for unit-norm inputs but grows
    without bound otherwise.)

    Args:
        s: First vector (array-like accepted by ``np.dot``).
        k: Second vector, same length as ``s``.
        p: Sharpening exponent; larger values push mid-range scores toward 0.

    Returns:
        The signed, sharpened similarity. ``0.0`` when either vector has zero
        norm, since the cosine is undefined there.
    """
    dot = np.dot(s, k)
    norm_product = np.linalg.norm(s) * np.linalg.norm(k)
    if norm_product == 0:
        # Avoid a 0/0 -> nan result for all-zero embeddings.
        return 0.0

    cosine_magnitude = np.abs(dot) / norm_product
    return np.sign(dot) * cosine_magnitude ** p

def get_sim(text1, text2):
    """Encode both texts with ``sentenceTF`` and score the two embeddings with ``sim``."""
    first_vec, second_vec = sentenceTF.encode([text1, text2])
    return sim(first_vec, second_vec)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


In [42]:
# The rewrite instruction is placed directly in front of the passage it applies to.
input_text = f"{prompt}{original_text}"
# Alternative prompts tried earlier (kept for reference):
# input_text = f"What is the prompt that changed the original text to the rewriten text? \n Here is the original text ``` {original_text} ``` \nHere is the rewritten_text ```{rewritten_text}```"
# input_text = f"You are an expert prompt reverse engineer. Give me as details as you can about what the text seems like: ```{rewritten_text}```"
# Despite its name, `input_ids` holds the tokenizer's full return value (moved to `device`),
# which is why it is later unpacked with ** and indexed with ["input_ids"].
input_ids = tokenizer(
    input_text,
    return_tensors="pt",
).to(device)

In [43]:
# 224 appears to be the maximum number of generated tokens

In [44]:
# Generate a rewrite, streaming tokens to stdout as they are produced.
# NOTE(review): in transformers `max_length` bounds prompt + generated tokens together;
# if 224 is meant as a budget for the continuation only, `max_new_tokens` is the
# matching argument -- confirm intent before changing.
outputs = model.generate(
    **input_ids,
    max_length=224,
    streamer=streamer,
)
# print(tokenizer.decode(outputs[0]))
# count_tokens(tokenizer.decode(outputs[0]).replace('\n', ''))



**Shanty:**

(Verse 1) The text has been rewritten, a tale spun anew,
By the Gemma LLM, with a rewrite prompt too.
The goal is to find, the prompt that was used,
To unlock the secrets, hidden in the text.

(Chorus) Oh, me hearties, let's sing along,
The competition's on, where we belong.
With code and cunning, we'll crack the code,
And find the prompt, with all our might.

(Verse 2) The text passages, they dance and flow,
But the prompt remains


In [49]:
# Skip the first `prompt_token_count` positions so only the tokens after the prompt are decoded.
prompt_token_count = input_ids["input_ids"].shape[1]
generated = tokenizer.decode(outputs[0, prompt_token_count:])
generated

"\n\n**Shanty:**\n\n(Verse 1) The text has been rewritten, a tale spun anew,\nBy the Gemma LLM, with a rewrite prompt too.\nThe goal is to find, the prompt that was used,\nTo unlock the secrets, hidden in the text.\n\n(Chorus) Oh, me hearties, let's sing along,\nThe competition's on, where we belong.\nWith code and cunning, we'll crack the code,\nAnd find the prompt, with all our might.\n\n(Verse 2) The text passages, they dance and flow,\nBut the prompt remains"

In [50]:
# Score the regenerated text against the reference rewrite; left as the cell's
# last expression so the value is displayed.
get_sim(generated, rewritten_text)

0.8815261577554485

In [16]:
# Load the CSV into `df`; the next cell reads its `rewrite_prompt` column.
df = pd.read_csv("public_10k_unique_rewrite_prompt.csv")

In [41]:
# Export only the rewrite_prompt column (the index is written too, as with the default to_csv).
df["rewrite_prompt"].to_csv("rewrite.csv")

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig

In [3]:
# Experiment configuration: run name, input data, base checkpoint and output locations.
exp_name = "phi2_public_data_sft"
data_path = "public_10k_unique_rewrite_prompt.csv"
model_path = "microsoft/phi-2"
output_path = "outputs"
model_save_path = exp_name + "_adapter"

In [4]:
# Training hyperparameters consumed by the cells below.
epochs = 5
batch_size = 1        # alternative value noted by the author: 2
max_seq_length = 512  # alternative value noted by the author: 1024
lr = 1e-4

In [5]:
# Read the data, hold out 30% for validation (fixed seed), and renumber each split from 0.
df = pd.read_csv(data_path)
train_df, val_df = (
    split.reset_index(drop=True)
    for split in train_test_split(df, test_size=0.3, random_state=42)
)

# Wrap both frames as `datasets.Dataset` objects for the trainer.
train_ds, val_ds = Dataset.from_pandas(train_df), Dataset.from_pandas(val_df)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 weights, float16 compute, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=False,
)

# NOTE(review): trust_remote_code=True runs Python shipped with the checkpoint;
# keep it only if the checkpoint actually requires it.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=True,  # `use_auth_token` is deprecated in transformers; `token` is its replacement
)
model.config.gradient_checkpointing = False

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|██████████| 2/2 [03:12<00:00, 96.50s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s]


In [7]:
def token_len(text):
    """Return the first entry of the ``length`` field the notebook-level ``tokenizer`` reports for ``text``."""
    return tokenizer(text, return_length=True)['length'][0]

def formatting_prompts_func(example):
    """Render a batch of rows into training strings.

    ``example`` maps ``'original_text'``, ``'rewritten_text'`` and
    ``'rewrite_prompt'`` to parallel sequences. One string is built per row;
    rows whose rendered text is longer than ``max_seq_length`` tokens
    (as measured by ``token_len``) are left out of the returned list.
    """
    rendered = (
        f"Instruct: Original Text:{example['original_text'][idx]}\nRewritten Text:{rewritten}\nWrite a prompt that was likely given to the LLM to rewrite original text into rewritten text.Output: {example['rewrite_prompt'][idx]}"
        for idx, rewritten in enumerate(example['rewritten_text'])
    )
    # Keep only the rows that fit within the sequence budget.
    return [text for text in rendered if token_len(text) <= max_seq_length]

In [8]:
# Same "Output:" marker that formatting_prompts_func places before the target prompt.
# Presumably the collator uses it to locate where the completion starts -- confirm against TRL docs.
response_template = "Output:"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
)

In [9]:
# LoRA adapter settings for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Module names that receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    # Rank, scaling and dropout of the adapters.
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [10]:
args = TrainingArguments(
    output_dir=output_path,
    report_to='none',
    # Optimisation
    optim="adafactor",
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    # Schedule and batching (eval batch is twice the train batch)
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    gradient_accumulation_steps=8,
    # Evaluation, checkpointing and logging
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [11]:
# Wire the quantized model, LoRA config, datasets, formatter and collator into one trainer.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map:   0%|          | 0/7399 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2159 > 2048). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 7399/7399 [00:05<00:00, 1247.09 examples/s]
Map: 100%|██████████| 3172/3172 [00:02<00:00, 1087.06 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
# Run fine-tuning; left as the last expression so the returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss
0,1.2282,1.134236
1,1.0357,1.000741
2,0.8866,0.953702
3,0.836,0.937026
4,0.8438,0.934122




TrainOutput(global_step=1320, training_loss=1.059759164578987, metrics={'train_runtime': 7885.3166, 'train_samples_per_second': 2.679, 'train_steps_per_second': 0.167, 'total_flos': 1.309273021254144e+17, 'train_loss': 1.059759164578987, 'epoch': 5.0})

In [14]:
# NOTE(review): this overrides the `model_save_path` set in the configuration cell
# (f'{exp_name}_adapter'); artifacts are written to "phi2" instead -- confirm which is intended.
model_save_path = "phi2"
# Save the trained model via the trainer, then the tokenizer files, into the same directory.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('phi2/tokenizer_config.json',
 'phi2/special_tokens_map.json',
 'phi2/vocab.json',
 'phi2/merges.txt',
 'phi2/added_tokens.json',
 'phi2/tokenizer.json')