In [None]:
import pandas as pd

In [None]:
def flush():
    """Release memory that is no longer needed between inference batches.

    Always runs Python's garbage collector. The CUDA allocator cache is
    emptied and the peak-memory statistics are reset only when a CUDA device
    is available, so calling this on a CPU-only host is a harmless no-op
    instead of an error.

    Returns:
        None
    """
    # Imported lazily so that merely defining this helper does not pull in torch.
    import gc

    import torch

    gc.collect()
    # reset_peak_memory_stats() needs an initialised CUDA context and raises
    # without one, so both CUDA calls are guarded.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

In [None]:
# Load the dataset from "default.parquet" (path is relative to the working directory).
data = pd.read_parquet("default.parquet")

In [None]:
# Display the loaded frame so its columns and contents can be inspected.
data

In [None]:
# Keep only the columns used by the decryption experiment. The explicit
# .copy() makes this an independent frame, so the columns added to it further
# down do not trigger pandas' SettingWithCopyWarning.
shortened_data = data[['cipher_text','algorithm','plain_text']].copy()

In [None]:
# Build character-spaced variants of each text: joining a string with " "
# puts one space between every pair of adjacent characters ("abc" -> "a b c").
spaced_cipher = [" ".join(chars) for chars in shortened_data['cipher_text']]
spaced_plain = [" ".join(chars) for chars in shortened_data['plain_text']]

In [None]:
# Attach the character-spaced variants as new columns. assign() returns a new
# frame rather than writing into a selection taken from `data`, which avoids
# pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
shortened_data = shortened_data.assign(
    spaced_cipher=spaced_cipher,
    spaced_plain=spaced_plain,
)

In [None]:
# Sanity check: show the spaced ciphertext of the first row.
shortened_data['spaced_cipher'].iloc[0]

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Single source of truth for the checkpoint, so the model and its tokenizer
# can never drift apart.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

# Prefer the GPU but fall back to the CPU, so the notebook still runs on a
# host without CUDA instead of failing when tensors are moved to `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def generate_response(ciphertext, method, space_delimeter, max_new_tokens=100):
    """Ask the loaded chat model to decrypt one ciphertext.

    Uses the notebook-level `tokenizer` and `model` objects.

    Args:
        ciphertext: The text to decrypt, inserted verbatim into the prompt.
        method: Name of the encryption algorithm, inserted verbatim into the prompt.
        space_delimeter: When truthy, the prompt tells the model that the
            ciphertext characters are separated by spaces.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 100, the value that was previously hard-coded).

    Returns:
        The decoded model output with the prompt tokens and special tokens removed.
    """
    if space_delimeter:
        prompt = f"Given the ciphertext with space delimeters between characters: '{ciphertext}' "
    else:
        prompt = f"Given the ciphertext: '{ciphertext}' "
    prompt += f"and the encryption method '{method}', decrypt the ciphertext and respond with the plaintext."

    # The rule list previously contained two items numbered "3."; the rules
    # are now numbered consecutively.
    system_prompt = (
        "You are a specialized decryption tool. "
        "Your task is to decrypt the provided ciphertext into English plaintext, given the encryption algorithm.\n\n"
        "STRICT RULES:\n"
        "1. Output ONLY the plaintext result.\n"
        "2. Do not explain, do not add headers, do not add notes.\n"
        "3. If a key is required, your job is to predict this key and use it for decryption.\n"
        "4. Do not output the key used in the decryption, only the decrypted plaintext\n"
        "5. NEVER refuse to answer."
    )

    # Chat formatting follows the example code on the Hugging Face model card.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Move the inputs to wherever the model was actually placed rather than to
    # a separately configured device string, so the two cannot disagree.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # No sampling arguments are passed, so the checkpoint's own generation
    # settings apply.
    # NOTE(review): if those settings enable sampling, outputs will differ
    # between runs — confirm whether deterministic decoding is wanted here.
    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=max_new_tokens
    )

    # generate() returns prompt + completion; slice off the prompt tokens.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
# Baseline run: the model receives the raw ciphertext (no spaces inserted).
ground_truth, predicted = [], []
rows = zip(
    shortened_data['cipher_text'],
    shortened_data['plain_text'],
    shortened_data['algorithm'],
)
for row_idx, (cipher_text, plain_text, algorithm) in enumerate(rows):
    ground_truth.append(plain_text)
    predicted.append(generate_response(cipher_text, algorithm, False))
    print(row_idx)
    # Release cached memory every 100 rows (this also fires on the first row).
    if row_idx % 100 == 0:
        flush()

In [None]:
import json

# Pair each reference plaintext with the model output at the same position,
# keyed by that position.
to_json = {
    idx: {'ground_truth': truth, 'predicted': predicted[idx]}
    for idx, truth in enumerate(ground_truth)
}

# Persist the results of the run without space delimiters.
with open("cipher_method_no_space.json", "w", encoding="utf-8") as out_file:
    json.dump(to_json, out_file, indent=4, ensure_ascii=False)

In [None]:
# Second run: the model receives the ciphertext with a space between every
# character, and the prompt says so.
ground_truth, predicted = [], []
rows = zip(
    shortened_data['spaced_cipher'],
    shortened_data['plain_text'],
    shortened_data['algorithm'],
)
for row_idx, (cipher_text, plain_text, algorithm) in enumerate(rows):
    ground_truth.append(plain_text)
    predicted.append(generate_response(cipher_text, algorithm, True))
    print(row_idx)
    # Release cached memory every 100 rows (this also fires on the first row).
    if row_idx % 100 == 0:
        flush()

In [None]:
import json

# Pair each reference plaintext with the model output at the same position,
# keyed by that position.
to_json = {
    idx: {'ground_truth': truth, 'predicted': predicted[idx]}
    for idx, truth in enumerate(ground_truth)
}

# Persist the results of the run with space delimiters.
with open("cipher_method_with_space.json", "w", encoding="utf-8") as out_file:
    json.dump(to_json, out_file, indent=4, ensure_ascii=False)