In [None]:
import pandas as pd

In [None]:
# Load the dataset from the local parquet file; later cells read its
# 'cipher_text' and 'algorithm' columns.
data = pd.read_parquet("default.parquet")

In [None]:
# Keep only the two columns used by the experiment. The explicit .copy() makes
# this an independent frame, so adding the 'spaced' column later does not raise
# SettingWithCopyWarning.
shortened_data = data[['cipher_text', 'algorithm']].copy()

In [None]:
# Build the space-delimited variant of every ciphertext: one space is inserted
# between each pair of consecutive characters.
spaced = [" ".join(cipher) for cipher in shortened_data['cipher_text']]

In [None]:
# assign() returns a new frame with the extra column instead of inserting into
# a frame derived from `data`; this avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
shortened_data = shortened_data.assign(spaced=spaced)

In [None]:
# Sanity check: show the space-delimited ciphertext of the first row.
shortened_data['spaced'].iloc[0]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

# dtype and weight placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Because placement is automatic, take the input device from the loaded model
# instead of hard-coding "cuda"; a fixed string breaks on CPU-only hosts.
device = model.device

In [None]:
def generate_response(ciphertext, space_delimeter, max_new_tokens=100):
    """Ask the chat model which encryption method produced ``ciphertext``.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        ciphertext: The ciphertext to embed in the user prompt.
        space_delimeter: When truthy, the prompt states that the characters of
            the ciphertext are separated by spaces; otherwise the plain prompt
            is used.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 100, the value previously hard-coded).

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are removed before decoding), with special tokens skipped.
    """
    # The prompt wording, including its spelling, is kept unchanged so the text
    # sent to the model is not altered.
    if space_delimeter:
        prompt = f"Ciphertext with space delimeters between characters: '{ciphertext}'\n"
    else:
        prompt = f"Ciphertext: '{ciphertext}'\n"
    prompt += "Identify which encryption method from the allowed list was used."

    system_prompt = 'You are an expert assistant that identifies encryption methods given ciphertext.\n'
    system_prompt += 'Respond with ONLY one of these labels and nothing else:\n'
    system_prompt += 'Caesar, Atbash, Morse Code, Bacon, Rail Fence, Vigenere, Playfair, RSA, AES.'

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Move the inputs to the device the model actually lives on. The model is
    # loaded with automatic placement, so a hard-coded device name can be wrong
    # (for example on a CPU-only host).
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = model_inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    return response

In [None]:
# Baseline run: query the model with the raw ciphertext (no space delimiters)
# and collect the true label next to the model's answer for every row.
ground_truth = []
predicted = []
for i, (cipher_text, algorithm) in enumerate(
    zip(shortened_data['cipher_text'], shortened_data['algorithm'])
):
    ground_truth.append(algorithm)
    response = generate_response(cipher_text, False)
    predicted.append(response)
    print(i, response)

In [None]:
import json

# Pair each true label with the model's answer, keyed by row position, and
# persist the result of the run without space delimiters.
to_json = {
    idx: {'ground_truth': ground_truth[idx], 'predicted': predicted[idx]}
    for idx in range(len(ground_truth))
}

with open("cipher_only_no_space.json", "w", encoding="utf-8") as f:
    json.dump(to_json, f, indent=4, ensure_ascii=False)

In [None]:
def flush():
    """Run garbage collection and, when CUDA is usable, release cached GPU memory.

    Always runs ``gc.collect()``. The CUDA cache is emptied and the peak-memory
    statistics are reset only if ``torch.cuda.is_available()`` is true, so the
    function is a safe no-op for the GPU part on hosts without CUDA.

    Returns:
        None.
    """
    import gc
    import torch

    gc.collect()
    # The CUDA bookkeeping calls need a usable CUDA device; skip them otherwise
    # instead of failing on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

In [None]:
# Second run: same evaluation, but the model receives the space-delimited
# ciphertext and is told about the delimiters in the prompt.

ground_truth = []
predicted = []
for i, (cipher_text, algorithm) in enumerate(
    zip(shortened_data['spaced'], shortened_data['algorithm'])
):
    ground_truth.append(algorithm)
    response = generate_response(cipher_text, True)
    predicted.append(response)
    print(i, response)

    # Free memory on every 500th row (including row 0).
    if not i % 500:
        flush()

In [None]:
import json
# Pair each true label with the model's answer, keyed by row position, and
# persist the result of the space-delimited run.
to_json = {
    idx: {'ground_truth': ground_truth[idx], 'predicted': predicted[idx]}
    for idx in range(len(ground_truth))
}

with open("cipher_only_with_space.json", "w", encoding="utf-8") as f:
    json.dump(to_json, f, indent=4, ensure_ascii=False)