In [None]:
import pandas as pd

# Training split of the prompt-recovery data. Row 0 is reused further down as
# the worked example embedded in the LLM prompt.
train_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/train.csv')
# train_df.head()

In [None]:
# train_df.loc[0,'original_text']

In [None]:
# train_df.loc[0,'rewrite_prompt']

In [None]:
# train_df.loc[0,'rewritten_text']

In [None]:
!mkdir /kaggle/working/gemma/
!cp /kaggle/input/gemma-pytorch/gemma_pytorch-main/gemma/* /kaggle/working/gemma/

In [None]:
# Offline install from a wheel attached as a dataset: --no-index keeps pip from
# contacting a package index, --no-deps skips dependency resolution.
!pip install --no-index --no-deps /kaggle/input/immutabledict/immutabledict-4.1.0-py3-none-any.whl

In [None]:
import sys 
sys.path.append("/kaggle/working/") 
from gemma.config import GemmaConfig, get_config_for_7b, get_config_for_2b
from gemma.model import GemmaForCausalLM
from gemma.tokenizer import Tokenizer
import contextlib
import os
import torch
# Model selection. VARIANT drives the config choice, the quantisation flag and
# the checkpoint filename used when the model is loaded.
VARIANT = "2b-it"
MACHINE_TYPE = "cuda"
# Derived from VARIANT so the weights directory and the checkpoint filename
# cannot drift apart when the variant is changed.
# NOTE(review): the trailing "2" looks like the dataset version directory;
# confirm it is the same for any other variant you switch to.
weights_dir = f'/kaggle/input/gemma/pytorch/{VARIANT}/2'

@contextlib.contextmanager
def _set_default_tensor_type(dtype: torch.dtype):
  """Sets the default torch dtype to the given dtype."""
  torch.set_default_dtype(dtype)
  yield
  torch.set_default_dtype(torch.float)

# Model config: pick the architecture preset that matches VARIANT, then point
# it at the tokenizer shipped next to the weights.
if "2b" in VARIANT:
  model_config = get_config_for_2b()
else:
  model_config = get_config_for_7b()
model_config.tokenizer = os.path.join(weights_dir, "tokenizer.model")
model_config.quant = "quant" in VARIANT

# Model: construct and load under the config's dtype, then move it to the
# target device and switch to eval mode.
device = torch.device(MACHINE_TYPE)
ckpt_path = os.path.join(weights_dir, f'gemma-{VARIANT}.ckpt')
with _set_default_tensor_type(model_config.get_dtype()):
  model = GemmaForCausalLM(model_config)
  model.load_weights(ckpt_path)
  model = model.to(device).eval()

In [None]:
import random

# Seed both RNGs. Python's `random` alone does not touch torch's generator.
# NOTE(review): generation presumably samples through torch, in which case the
# torch seed is the one that makes runs repeatable - confirm against
# GemmaForCausalLM.generate.
random.seed(0)
torch.manual_seed(0)
# This is the prompt format the model expects
USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

In [None]:
# prompt_for_llm = (
#     "<start_of_turn>user\nYou are a smart linguist and you are solving a puzzle.You need to generate a rewrite_prompt that effectively transforms the given original_text into the provided rewritten_text."
#     "Capture the essence,tone,style,and context of the content while improving the language, coherence, and expressiveness."
#     "Pay attention to detail, clarity, and overall quality in your generated rewrite_prompt."
#     "Here is an example sample: original text-" + train_df.loc[0, 'original_text'] +
#     "rewritten_text-" + train_df.loc[0, 'rewritten_text'] +
#     "and this is the right rewrite_prompt-" + train_df.loc[0, 'rewrite_prompt'] +
#     "Now, You will output in text the most suitable rewrite_prompt. For the given original_text- {ot}" +
#     "and rewritten_text- {rt}" +
#     "<end_of_turn>\n<start_of_turn>model\n"
# )
def _escape_braces(text):
    """Double every brace in `text` so str.format() treats it as literal text."""
    return text.replace("{", "{{").replace("}", "}}")


# Few-shot prompt template. It is filled in later with
# `.format(ot=..., rt=...)`, so the only live placeholders must be {ot} and
# {rt}. The example row is spliced in before formatting, so its braces are
# escaped; otherwise a `{` or `}` in the training text makes `.format` raise
# or be misread as a placeholder.
# NOTE(review): the adjacent literals and the spliced example texts are joined
# with no separating space or newline (e.g. "...rewritten_text.You need...").
# Left byte-for-byte as-is here because changing the wording changes the
# model's output; consider adding separators deliberately.
prompt_for_llm = (
    "<start_of_turn>user\nYou are a smart and talented linguist who loves to take challenges. You are given to solve a puzzle. You need to generate a rewrite_prompt that effectively transforms the given original_text into the provided rewritten_text."
    "You need to capture the essence,tone,style,and context of the content while improving the language, coherence, and expressiveness."
    "Also pay attention to the detail, clarity, and overall quality in your generated rewrite_prompt."
    "Here is an example sample: original text-" + _escape_braces(train_df.loc[0, 'original_text']) +
    "rewritten_text-" + _escape_braces(train_df.loc[0, 'rewritten_text']) +
    "and this is the right rewrite_prompt-" + _escape_braces(train_df.loc[0, 'rewrite_prompt']) +
    "Now, you will output in text the most suitable rewrite_prompt. For the given original_text- {ot}" +
    "and rewritten_text- {rt}" +
    "<end_of_turn>\n<start_of_turn>model\n"
)
# prompt_for_llm = (
#     "<start_of_turn>user\nYou are a smart and talented linguist who loves to take challenges. You are given to solve a puzzle. You need to generate a rewrite_prompt that effectively transforms the given original_text into the provided rewritten_text."
#     "You need to capture the essence,tone,style,and context of the content while improving the language, coherence, and expressiveness."
#     "Also pay attention to the detail, clarity, and overall quality in your generated rewrite_prompt."
#     "Here is an example sample: For the original text-" + train_df.loc[0, 'original_text'] +
#     "and rewritten_text-" + train_df.loc[0, 'rewritten_text'] +
#     ",the rightly generated rewrite_prompt-" + train_df.loc[0, 'rewrite_prompt'] +
#     "Now, you will output in text the most suitable rewrite_prompt, for the given original_text- {ot}" +
#     "and rewritten_text- {rt}" +
#     "<end_of_turn>\n<start_of_turn>model\n"
# )

In [None]:
# Rows to predict for. The inference loop reads the 'id', 'original_text' and
# 'rewritten_text' columns.
test = pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv')

In [None]:
# sample_sub = pd.read_csv('/kaggle/input/llm-prompt-recovery/sample_submission.csv')

In [None]:
# test

In [None]:
# sample_sub

In [None]:
# Generate one recovered prompt per test row, in file order.
#
# `predictions[k]` is the model output for the row whose id is `ids[k]`.
# Rows are walked positionally, so this does not depend on `test` having a
# default 0..n-1 index, and an empty `test` simply yields two empty lists.
# Generation is done one row at a time.
predictions = []
ids = []

test_rows = zip(
    test['id'].tolist(),
    test['original_text'].tolist(),
    test['rewritten_text'].tolist(),
)
for row_id, original_text, rewritten_text in test_rows:
    rewrite_prompt = model.generate(
        prompt_for_llm.format(ot=original_text, rt=rewritten_text),
        device=device,
        output_len=512,
    )
    predictions.append(rewrite_prompt)
    ids.append(row_id)


In [None]:
# Spot-check the first generated prompt.
predictions[0]

In [None]:
# Assemble the submission frame: one row per test id with its generated prompt.
# The mapping is passed inline rather than through a variable named `dict`,
# which would shadow the builtin for every later cell in the session.
sample_sub = pd.DataFrame({'id': ids, 'rewrite_prompt': predictions})

In [None]:
# Preview the submission frame before writing it out.
sample_sub

In [None]:
# Write the submission to the current working directory, without the index column.
sample_sub.to_csv('submission.csv',index=False)