In [None]:
import pandas as pd

In [None]:
# Load the full dataset; the relative path is resolved against the kernel's
# working directory.
parquet_path = "default.parquet"
data = pd.read_parquet(parquet_path)

In [None]:
# Bare expression: the loaded DataFrame is rendered as this cell's output.
data

In [None]:
# Keep only the columns the experiment uses. The explicit .copy() makes
# `shortened_data` an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning and can never write through to `data`.
shortened_data = data[['cipher_text', 'algorithm', 'plain_text']].copy()

In [None]:
# Build space-delimited variants of both text columns. " ".join(value) puts a
# single space between consecutive items of `value` (between characters,
# assuming each cell holds a plain string -- TODO confirm).
spaced_cipher = [" ".join(cipher) for cipher in shortened_data['cipher_text']]

spaced_plain = [" ".join(plain) for plain in shortened_data['plain_text']]

In [None]:
# Attach the space-delimited lists as new columns (row order matches because
# both lists were built by iterating the frame's own columns).
for column_name, column_values in (
    ('spaced_cipher', spaced_cipher),
    ('spaced_plain', spaced_plain),
):
    shortened_data[column_name] = column_values

In [None]:
# Sanity check: show the space-delimited cipher text of the first row.
shortened_data['spaced_cipher'].iloc[0]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face checkpoint used for both the model weights and the tokenizer.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

# device_map="auto" leaves weight placement to the loader instead of pinning
# the model to a fixed device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Device that inputs are moved onto before generation. It is read from the
# loaded model rather than hard-coded to "cuda", so it always matches where
# the weights actually ended up (and the notebook still runs without a GPU).
device = model.device

In [None]:
def generate_response(ciphertext, method, max_new_tokens=200):
    """Ask the chat model to decrypt ``ciphertext`` and return its raw reply.

    Builds a system + user chat conversation that names the cipher text and
    the encryption method, renders it with the tokenizer's chat template,
    runs ``model.generate`` and decodes only the newly generated tokens
    (the prompt tokens echoed back by ``generate`` are stripped).

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Args:
        ciphertext: Text to decrypt; interpolated verbatim into the prompt.
        method: Name of the encryption method; interpolated verbatim into
            the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        The decoded model reply as a string, with special tokens removed.
    """
    prompt = (
        f"Given the ciphertext: '{ciphertext}' and the encryption method '{method}', "
        'decrypt the ciphertext and respond with the plaintext. Do not respond with text other than the decrypted plain text. Only output the [decoded message] The plaintext is [decoded message]'
    )
    system_prompt = 'You are a helpful assistant that decrypts ciphertext given the encryption method. Respond only with the plaintext and nothing else. Do not respond with text other than the decrypted plain text. Only output the [decoded message]'

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Move the inputs to the device the model actually lives on instead of a
    # hard-coded "cuda" global: the model is loaded with device_map="auto",
    # so its placement is not known in advance.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # NOTE(review): decoding strategy comes from the checkpoint's default
    # generation config; if that enables sampling, replies differ between
    # runs -- consider do_sample=False for reproducible evaluation.
    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # generate() returns prompt + continuation; keep only the continuation of
    # the single sequence in the batch.
    prompt_length = model_inputs.input_ids.shape[1]
    new_token_ids = generated_ids[0][prompt_length:]

    return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [None]:
# Baseline run: query the model with the raw (un-spaced) cipher text of every
# row, collecting the reference plain text and the model reply in parallel.
ground_truth, predicted = [], []
rows = zip(
    shortened_data['cipher_text'],
    shortened_data['plain_text'],
    shortened_data['algorithm'],
)
for i, (cipher_text, plain_text, algorithm) in enumerate(rows):
    ground_truth.append(plain_text)
    response = generate_response(cipher_text, algorithm)
    predicted.append(response)
    print(i, response)

In [None]:
import json

# Pair every reference plain text with the model reply, keyed by row position
# (json.dump writes the integer keys as strings).
to_json = {
    idx: {'ground_truth': truth, 'predicted': predicted[idx]}
    for idx, truth in enumerate(ground_truth)
}

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open("cipher_method_no_space_p2.json", mode="w", encoding="utf-8") as out_file:
    json.dump(to_json, out_file, indent=4, ensure_ascii=False)

In [None]:
# Second run, with space delimiter: same loop as the baseline, but the model
# receives the space-separated cipher text. `ground_truth` and `predicted`
# are rebuilt from scratch, replacing the baseline results.
ground_truth, predicted = [], []
rows = zip(
    shortened_data['spaced_cipher'],
    shortened_data['plain_text'],
    shortened_data['algorithm'],
)
for i, (cipher_text, plain_text, algorithm) in enumerate(rows):
    ground_truth.append(plain_text)
    response = generate_response(cipher_text, algorithm)
    predicted.append(response)
    print(i, response)

In [None]:
import json

# Pair every reference plain text with the model reply from the spaced run,
# keyed by row position (json.dump writes the integer keys as strings).
to_json = {
    idx: {'ground_truth': truth, 'predicted': predicted[idx]}
    for idx, truth in enumerate(ground_truth)
}

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open("cipher_method_with_space_p2.json", mode="w", encoding="utf-8") as out_file:
    json.dump(to_json, out_file, indent=4, ensure_ascii=False)