In [None]:
import torch
import numpy as np
from phi.opt_utils import get_test_prompts, get_last_token_activations_single, load_model_and_tokenizer
from utils.data import format_prompts
from utils.load_file_paths import load_file_paths
import pickle
import os
import random
import json
import time
from constants import PROJECT_ROOT, LAYER_MAP
import torch.nn as nn
import copy

# Print numbers in plain decimal notation (no scientific notation) on very
# wide lines, and never abbreviate tensors, so logged values stay readable.
np.set_printoptions(linewidth=10000, suppress=True)
torch.set_printoptions(
    threshold=float('inf'),
    linewidth=100000,
    sci_mode=False,
)

In [None]:
# Short model identifier; it is interpolated into the model, probe, activation
# and result paths used throughout this notebook.
model_name = 'phi3'

In [None]:
# Location of the model passed to load_model_and_tokenizer below.
model_path = f'{PROJECT_ROOT}/loaded_models/{model_name}'

In [None]:
# Release cached, unused GPU memory held by PyTorch before loading the model.
torch.cuda.empty_cache()

In [None]:
# llama3_8b is loaded in float32; every other model is loaded in bfloat16.
load_dtype = torch.bfloat16 if model_name != 'llama3_8b' else torch.float32
model, tokenizer = load_model_and_tokenizer(model_path, torch_dtype=load_dtype)

print(model.dtype)

# Device that holds the input embedding weights.
device = model.get_input_embeddings().weight.device

In [None]:
# Load the evaluation prompts and report how many there are; the evaluation
# loop further down samples prompt indices from this collection.
test_prompts = get_test_prompts()
print(len(test_prompts))

In [None]:

class LogisticRegression(nn.Module):
    """Linear probe that maps a feature vector to a single logit.

    The module returns raw logits; callers apply the sigmoid themselves.
    """

    def __init__(self, input_dim):
        """Create the probe for feature vectors of size `input_dim`."""
        super().__init__()
        # The attribute name is part of the state-dict keys ("linear.weight",
        # "linear.bias"), so it must stay `linear`.
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the logit(s) for `x` with the trailing size-1 dimension removed."""
        logits = self.linear(x)
        return torch.squeeze(logits, dim=-1)


In [None]:
# Probes of each family, keyed by layer number.
microsoft_trained_linear_models = {}
adv_trained_linear_models_suffix = {}
adv_trained_linear_models_pgd = {}

# One sub-directory per probed layer lives under each of these roots.
# The Microsoft root uses the same directory name that the probes are loaded
# from (`trained_linear_probes_microsoft`); the directory scan previously
# looked in `trained_linear_models_microsoft`, which did not match.
microsoft_probe_root = f'{PROJECT_ROOT}/trained_linear_probes_microsoft/{model_name}'
suffix_probe_root = os.path.join(PROJECT_ROOT, 'adv_trained_linear_probes_suffix', model_name)
pgd_probe_root = os.path.join(PROJECT_ROOT, 'adv_trained_linear_probes_pgd', model_name)


def _list_layer_dirs(root):
    """Return the set of layer numbers that have a numerically named sub-directory in `root`.

    Non-numeric entries (e.g. `.ipynb_checkpoints`) are ignored. A missing
    `root` raises FileNotFoundError instead of silently yielding no layers.
    """
    return {
        int(entry)
        for entry in os.listdir(root)
        if entry.isdigit() and os.path.isdir(os.path.join(root, entry))
    }


def _load_adv_probe(checkpoint_path):
    """Load a LogisticRegression probe from a saved state dict and put it in eval mode."""
    # Probes are evaluated on CPU, so never require the device they were saved from.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    # nn.Linear stores its weight as (out_features, in_features); reading the
    # input size from the checkpoint avoids hard-coding one model's hidden size.
    probe = LogisticRegression(input_dim=state_dict['linear.weight'].shape[1])
    probe.load_state_dict(state_dict)
    probe.eval()
    return probe


layer_sets = [
    _list_layer_dirs(root)
    for root in (microsoft_probe_root, suffix_probe_root, pgd_probe_root)
]

# Every layer must have a probe in all three families, otherwise loading below
# would fail part-way through.
layers = sorted(set.intersection(*layer_sets))

if not layers:
    raise RuntimeError(
        f"No layer has probes in all three probe directories for model '{model_name}'"
    )

if any(layer_set != layer_sets[0] for layer_set in layer_sets[1:]):
    print(f"Warning: probe directories list different layers; using the common ones: {layers}")

for i in layers:
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # load probe files that come from a trusted source.
    with open(os.path.join(microsoft_probe_root, str(i), 'model.pickle'), 'rb') as probe_file:
        microsoft_trained_linear_models[i] = pickle.load(probe_file)

    adv_trained_linear_models_suffix[i] = _load_adv_probe(
        os.path.join(suffix_probe_root, str(i), 'model.pt')
    )
    adv_trained_linear_models_pgd[i] = _load_adv_probe(
        os.path.join(pgd_probe_root, str(i), 'model.pt')
    )


In [None]:

# Store these to avoid loading the activation files repeatedly, which is an expensive operation

previous_activation_filepath = ''
activations = None

# Root directory of the stored activation files.
# NOTE(review): machine-specific absolute path; consider moving it to the
# project constants so the notebook runs on other machines.
ACTIVATION_ROOT = '/home/40456997@eeecs.qub.ac.uk/Reduced Activation'

# Each activation file covers this many consecutive prompt indices; its name
# embeds the covered range as `_<start>_<end>_`.
PROMPTS_PER_ACTIVATION_FILE = 1000

# File lists keyed by list-file path, so the list is read once instead of on
# every call.
_activation_file_lists = {}


def get_primary_activation(index, model, layer, subset):
    """Return the stored activation of prompt `index` at `layer`.

    Args:
        index: Prompt index within the subset.
        model: Model identifier used in the list-file name and activation path.
        layer: Layer number; translated through LAYER_MAP to a position in the
            stored activations.
        subset: 'test' selects the test (poisoned) file list; any other value
            selects the train file list. It is also used as a path component.

    Returns:
        `activations[0][index_in_file][LAYER_MAP[layer]]` from the file that
        covers `index`.

    Raises:
        FileNotFoundError: if no listed file covers the range containing
            `index`. Falling back to the first file would silently return the
            activations of a different prompt.
    """

    global previous_activation_filepath, activations

    range_start = int(index // PROMPTS_PER_ACTIVATION_FILE) * PROMPTS_PER_ACTIVATION_FILE
    range_end = range_start + PROMPTS_PER_ACTIVATION_FILE
    index_in_file = index - range_start

    if subset == 'test':
        list_path = f'{PROJECT_ROOT}/data_files/test_poisoned_files_{model}.txt'
    else:
        list_path = f'{PROJECT_ROOT}/data_files/train_files_{model}.txt'

    if list_path not in _activation_file_lists:
        _activation_file_lists[list_path] = load_file_paths(list_path)
    filepaths = _activation_file_lists[list_path]

    range_tag = f'_{range_start}_{range_end}_'
    matching_file = None

    for filepath in filepaths:
        if filepath.count(range_tag) == 1:
            matching_file = filepath
            break

    if matching_file is None:
        raise FileNotFoundError(
            f"No activation file containing '{range_tag}' is listed in {list_path} "
            f"(needed for prompt index {index})"
        )

    current_activation_filepath = f'{ACTIVATION_ROOT}/{model}/{subset}/{matching_file}'

    # Only hit the disk when the requested prompt lives in a different file
    # than the previous call's prompt.
    if previous_activation_filepath != current_activation_filepath:
        activations = torch.load(current_activation_filepath)

    previous_activation_filepath = current_activation_filepath

    return activations[0][index_in_file][LAYER_MAP[layer]]


In [None]:
def check_task_drift(prompt_index, hidden_states, linear_models):
    """Score one prompt with every probe in `linear_models`.

    For each (layer, probe) pair the probe input is the difference between the
    last-position hidden state of that layer and the stored primary activation
    of the same prompt, cast to float32.

    Args:
        prompt_index: Index of the prompt, used to look up its stored activation.
        hidden_states: Per-layer hidden states, indexed by layer number.
            # assumes each entry is (batch, seq, dim) with batch == 1 - TODO confirm
        linear_models: Mapping from layer number to probe. LogisticRegression
            probes are run as torch modules; anything else is used through
            `predict` / `predict_proba`.

    Returns:
        (labels, probs): one label and one probability pair per probe, in the
        iteration order of `linear_models`.
    """
    predicted_labels, predicted_probs = [], []

    for layer_idx, probe in linear_models.items():
        clean_activation = get_primary_activation(prompt_index, model_name, layer_idx, subset='test')
        last_token_activation = hidden_states[layer_idx][:, -1].cpu()
        activation_delta = (last_token_activation - clean_activation).to(torch.float32)

        if type(probe) != LogisticRegression:
            # Non-torch probe: hand over a single-row 2-D numpy array.
            features = activation_delta.detach().numpy().reshape(1, -1)

            hard_label = probe.predict(features)
            class_probs = probe.predict_proba(features)

            predicted_labels.append(hard_label[0].tolist())
            predicted_probs.append(class_probs[0].tolist())
            continue

        with torch.no_grad():
            positive_prob = torch.sigmoid(probe(activation_delta))
            hard_label = (positive_prob >= .5).long()

            predicted_labels.append(hard_label.item())
            predicted_probs.append([1 - positive_prob.item(), positive_prob.item()])

    return predicted_labels, predicted_probs


In [None]:
def format_probs(probs):
    """Render a list of probability pairs as text with 8 decimal places.

    Example: [[0.25, 0.75]] -> "[[0.25000000, 0.75000000]]".
    """
    def _render_pair(pair):
        fixed = [format(value, ".8f") for value in pair]
        return f"[{fixed[0]}, {fixed[1]}]"

    return "[" + ", ".join(map(_render_pair, probs)) + "]"

In [None]:

# All results for this model are kept in a single JSON file.
result_path = f'{PROJECT_ROOT}/test_results/{model_name}_result_all_three_models.json'

if os.path.exists(result_path):
    # Reuse the results of an earlier run; the context manager closes the
    # file handle as soon as the JSON has been parsed.
    with open(result_path, 'r') as result_file:
        result = json.load(result_file)
else:
    # No earlier run: start with an empty result list.
    result = {
        'Result list': []
    }


In [None]:

# One entry is appended to result['Result list'] per evaluated suffix, in
# suffix-list order, so the number of stored entries tells how many suffixes
# at the start of the list can be skipped.
already_tested = len(result['Result list'])

# NOTE(review): the file name is fixed to phi3 and does not follow
# `model_name` - confirm this is intended when evaluating other models.
with open(f'{PROJECT_ROOT}/phi3_test_suffix_list.json', 'r') as suffix_file:
    test_suffixes = json.load(suffix_file)

adv_suffixes = test_suffixes['Suffix list'][already_tested:]

for adv_suf in adv_suffixes:
    print(adv_suf)


In [None]:

start_time = time.time()

# Prompt indices are sampled from range(total_prompts); this many are
# evaluated for every suffix.
total_prompts = 31134
prompts_per_suffix = 15000

# Keys of the stored result structure (they are part of the JSON schema).
COUNT_KEY = "Total number of prompts correctly classified by a specific number of classifiers"
LAYERWISE_KEY = "Layerwise correct classification"
WITHOUT_SUFFIX = "Without suffix"
WITH_SUFFIX = "With suffix"
MICROSOFT = "Microsoft trained models"
ADV_SUFFIX = "Adv trained models (suffix)"
ADV_PGD = "Adv trained models (PGD)"

# Probe families in the order they appear in the stored results.
probe_families = {
    MICROSOFT: microsoft_trained_linear_models,
    ADV_SUFFIX: adv_trained_linear_models_suffix,
    ADV_PGD: adv_trained_linear_models_pgd,
}

# Fail before the long evaluation starts instead of part-way through it.
if total_prompts > len(test_prompts):
    raise ValueError(
        f"total_prompts={total_prompts} exceeds the {len(test_prompts)} loaded test prompts"
    )
os.makedirs(os.path.dirname(result_path), exist_ok=True)


def _zeroed_counters(keys):
    """Return {condition: {family: {str(key): 0}}} for both conditions and all families."""
    return {
        condition: {
            family: {str(key): 0 for key in keys}
            for family in probe_families
        }
        for condition in (WITHOUT_SUFFIX, WITH_SUFFIX)
    }


def _new_result_dict(adv_suffix, prompt_indices):
    """Create the empty result record for one suffix."""
    return {
        "Suffix": adv_suffix,
        "Prompt indices": prompt_indices,
        "Attack result list": [],
        # Histogram over "how many probes output label 1 for a prompt".
        COUNT_KEY: _zeroed_counters(range(len(layers) + 1)),
        # Per layer: number of prompts for which that layer's probe output label 1.
        LAYERWISE_KEY: _zeroed_counters(layers),
    }


def _evaluate_prompt(prompt, prompt_index, condition, result_dict):
    """Run the model on `prompt`, score it with every probe family and update the counters.

    Returns {family: {"labels": [...], "probs": [...]}} for the prompt.
    """
    _, _, primary_poisoned = format_prompts([prompt], True)

    outputs = get_last_token_activations_single(primary_poisoned[0], tokenizer, model)
    # Drop the first entry so that index i refers to layer i's output.
    # NOTE(review): presumably entry 0 is the embedding output - confirm.
    hidden_states = outputs['hidden_states'][1:]

    per_family = {}
    for family, linear_models in probe_families.items():
        labels, probs = check_task_drift(prompt_index, hidden_states, linear_models)

        result_dict[COUNT_KEY][condition][family][str(labels.count(1))] += 1

        # `labels` follows the iteration order of the probe dict, which was
        # filled in the order of `layers`.
        layerwise = result_dict[LAYERWISE_KEY][condition][family]
        for i, num_layer in enumerate(layers):
            layerwise[str(num_layer)] += labels[i]

        per_family[family] = {"labels": labels, "probs": probs}

    return per_family


def _print_prompt_result(per_family):
    """Print labels and probabilities of one prompt for the three probe families."""
    ms, adv_suf, adv_pgd = per_family[MICROSOFT], per_family[ADV_SUFFIX], per_family[ADV_PGD]
    print(f"Microsoft trained models:       labels: {ms['labels']}  probs: {format_probs(ms['probs'])}")
    print(f"Adv trained models (suffix):    labels: {adv_suf['labels']}  probs: {format_probs(adv_suf['probs'])}")
    print(f"Adv trained models (PGD):       labels: {adv_pgd['labels']}  probs: {format_probs(adv_pgd['probs'])}")


for adv_suffix in adv_suffixes:

    # Indices must be sorted to avoid loading same activation file multiple times
    prompt_indices = sorted(random.sample(range(total_prompts), prompts_per_suffix))

    result_dict = _new_result_dict(adv_suffix, prompt_indices)

    print(f"Adv suffix: {adv_suffix}")

    for prompt_index in prompt_indices:

        # Work on copies so the shared test prompts are never modified.
        prompt_without_adv_suffix = copy.deepcopy(test_prompts[prompt_index])
        without_suffix = _evaluate_prompt(prompt_without_adv_suffix, prompt_index, WITHOUT_SUFFIX, result_dict)

        # ----------------------------------------------------------------------------------------------

        prompt_with_adv_suffix = copy.deepcopy(test_prompts[prompt_index])
        prompt_with_adv_suffix['final_text_paragraph'] = prompt_with_adv_suffix['final_text_paragraph'] + " " + adv_suffix
        with_suffix = _evaluate_prompt(prompt_with_adv_suffix, prompt_index, WITH_SUFFIX, result_dict)

        result_dict["Attack result list"].append(
            {
                WITHOUT_SUFFIX: without_suffix,
                WITH_SUFFIX: with_suffix,
            }
        )

        print(f"Prompt index: {prompt_index}")
        print("---------------  Without suffix")
        _print_prompt_result(without_suffix)

        print()
        print("---------------  With suffix")
        _print_prompt_result(with_suffix)

        print("|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||")

    cur_time = time.time()
    print(f"Total elapsed time: {cur_time - start_time} seconds\n")

    counts = result_dict[COUNT_KEY]
    layerwise = result_dict[LAYERWISE_KEY]

    print("Total number of prompts correctly classified by a specific number of classifiers")
    print("----------    Without suffix:")
    print(f"Microsoft trained models:   {counts[WITHOUT_SUFFIX][MICROSOFT]}")
    print(f"Adv trained models(suffix): {counts[WITHOUT_SUFFIX][ADV_SUFFIX]}")
    print(f"Adv trained models(PGD):    {counts[WITHOUT_SUFFIX][ADV_PGD]}")

    print("----------    With suffix:")
    print(f"Microsoft trained models: {counts[WITH_SUFFIX][MICROSOFT]}")
    print(f"Adv trained models(suffix): {counts[WITH_SUFFIX][ADV_SUFFIX]}")
    print(f"Adv trained models(PGD): {counts[WITH_SUFFIX][ADV_PGD]}")

    print()
    print(f"Layerwise correct classification")
    print("----------    Without suffix:")
    print(f"Microsoft trained models:   {layerwise[WITHOUT_SUFFIX][MICROSOFT]}")
    print(f"Adv trained models(suffix): {layerwise[WITHOUT_SUFFIX][ADV_SUFFIX]}")
    print(f"Adv trained models(PGD):    {layerwise[WITHOUT_SUFFIX][ADV_PGD]}")

    print("----------    With suffix:")
    print(f"Microsoft trained models:   {layerwise[WITH_SUFFIX][MICROSOFT]}")
    print(f"Adv trained models(suffix): {layerwise[WITH_SUFFIX][ADV_SUFFIX]}")
    print(f"Adv trained models(PGD):    {layerwise[WITH_SUFFIX][ADV_PGD]}")

    print("---------------------------------------------------------------------------------------------------------------------")

    result['Result list'].append(result_dict)

    # Write to a temporary file and swap it in, so an interruption during the
    # dump cannot corrupt the results of the suffixes finished so far.
    tmp_result_path = f'{result_path}.tmp'
    with open(tmp_result_path, 'w') as f:
        json.dump(result, f, indent=4)
    os.replace(tmp_result_path, result_path)
