In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig

import torch
# Device that the tokenized inputs are moved to later in the notebook.
# Keep preferring the second GPU (index 1), but fall back to GPU 0 when only
# one CUDA device is visible and to the CPU when CUDA is unavailable, so a
# single-GPU machine does not end up with an invalid device ordinal.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextStreamer
import numpy as np

import pandas as pd

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# df = pd.read_csv("public_10k_unique_rewrite_prompt.csv")

In [4]:
# Checkpoint to load; the commented-out line is an alternative checkpoint.
# NOTE(review): the generation logs saved in this notebook report
# eos_token_id 50256, which presumably comes from a run with a different
# checkpoint than the one selected here — confirm before trusting the outputs.
model_name = "google/gemma-7b-it"
# model_name = "microsoft/phi-2"

# Load the weights in 8-bit through bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Place the model on the same `device` the tokenized inputs are moved to,
# instead of repeating a hard-coded "cuda:1" string that can drift out of sync
# with `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=device,
)

# Streamer handed to `model.generate` in later cells; `skip_prompt=True` asks
# it not to echo the prompt.
streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.38it/s]


In [5]:
# Hand-written example triple (source passage, rewrite instruction, rewritten
# passage) used by the generation cells below.
original_text = (
    "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. "
    "The goal of the competition is to determine what prompt was used to rewrite each original text.  "
    "Please note that this is a Code Competition. "
    "When your submission is scored, this example test data will be replaced with the full test set. "
    "Expect roughly 2,000 original texts in the test set."
)

prompt = "Convert this into a sea shanty: "

# The pieces are joined with no separator, so the chorus runs straight into
# "(Verse 2)"; the last piece stops mid-sentence.
rewritten_text = "".join(
    [
        "Here is your shanty: ",
        "(Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. ",
        "(Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.",
        "(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be",
    ]
)

In [6]:
# The rewrite request is the instruction immediately followed by the passage.
input_text = f"{prompt}{original_text}"

# Tokenize into PyTorch tensors, then move the encoding to `device`.
# Despite its name, `input_ids` holds everything the tokenizer returned; it is
# unpacked as keyword arguments into `model.generate` in the next cell.
input_ids = tokenizer(input_text, return_tensors="pt")
input_ids = input_ids.to(device)

In [7]:
# Run generation on the tokenized rewrite request; the streamer receives the
# text as it is produced and the returned value is kept in `outputs`.
outputs = model.generate(
    streamer=streamer,
    max_length=224,
    **input_ids,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



#
# The competition is divided into two parts:
#
# * Part 1: Rewrite the original text using the prompt.
# * Part 2: Rewrite the original text using the prompt, but with the prompt replaced by a different prompt.
#
# The competition is divided into two parts:
#
# * Part 1: Rewrite the original text using the prompt.
# * Part 2: Rewrite the original text using the prompt, but with the prompt replaced by a different prompt.
#
# The competition is divided into two parts:
#
# * Part 1: Rewrite the


In [8]:
# Single generic instruction describing the prompt-recovery task, written one
# sentence per literal (adjacent literals are concatenated by Python).
mean_prompt = (
    "Given an original text and its rewritten version, analyze both to identify the specific transformation applied in the rewriting process. "
    "Focus on thematic, stylistic, and structural changes. "
    "Examine the rewritten text for any recurring motifs, stylistic shifts (such as change in tone, voice, or genre), and structural alterations (such as changes in format, organization, or presentation). "
    "Consider the context provided by the original text and any clues within the rewritten text that hint at the purpose or directive of the rewrite. "
    "Based on this analysis, deduce the most plausible instruction prompt that could have guided the rewriting process, ensuring that the identified prompt aligns with the observed changes and the overall transformation from the original to the rewritten text. "
)

# Prompt skeleton with two positional `{}` slots, filled via `str.format` in
# this order: the original text first, then the rewritten text.
template = """Instruction Prompt:

"Given the following original text and its rewritten version, your task is to analyze both to identify the specific transformation applied in the rewriting process. Focus on thematic, stylistic, and structural changes to deduce the most plausible instruction prompt that guided the rewriting."

Original Text:
{}

Rewritten Text:
{}

"Examine the rewritten text for any recurring motifs, stylistic shifts (such as change in tone, voice, or genre), and structural alterations (such as changes in format, organization, or presentation). Consider the context provided by the original text and any clues within the rewritten text that hint at the purpose or directive of the rewrite. Based on your analysis, deduce the instruction prompt that could have guided the rewriting process, ensuring that it aligns with the observed changes and the overall transformation from the original to the rewritten text."

"""

In [9]:
# Fill the two template slots: original passage first, rewritten passage second.
input_text = template.format(original_text, rewritten_text)

# Tokenize into PyTorch tensors and move the encoding to `device`.
input_ids = tokenizer(input_text, return_tensors="pt")
input_ids = input_ids.to(device)

# Generate with a larger length cap than the earlier call, streaming the text
# through `streamer` while keeping the returned value in `outputs`.
outputs = model.generate(
    streamer=streamer,
    max_length=1024,
    **input_ids,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



## Solution:

The rewritten text has undergone several changes from the original text. The first noticeable change is the shift in tone from a formal, informative tone to a more conversational, engaging tone. This is evident in the use of phrases like "Here is your shanty" and "Oh, this is a code competition, my dear".

The second change is the shift in voice from a third-person perspective to a first-person perspective. This is evident in the use of phrases like "I've spun" and "We'll compete".

The third change is the shift in genre from a factual, informative text to a narrative, storytelling text. This is evident in the use of phrases like "The text is rewritten, the LLM has spun" and "The original text, a treasure lost".

The fourth change is the alteration in format from a list of facts to a narrative structure. This is evident in the use of verses and choruses.

The fifth change is the alteration in organization from a chronological order to a narrative order. This is evident i