In [1]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting torch<3,>=2.2 (from bitsandbytes)
  Downloading torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting typing-extensions>=4.10.0 (from torch<3,>=2.2->bitsandbytes)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.6.8

In [2]:
pip install trl 

Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting accelerate>=0.34.0 (from trl)
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers>=4.50.0 (from trl)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate>=0.34.0->trl)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting safetensors>=0.4.3 (from accelerate>=0.34.0->trl)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting pyarrow>=15.0.0 (from datasets>=3.0.0->trl)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets>=3.0.0->trl)


In [3]:

import numpy as np
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList
import torch 


In [4]:

# Run for unaligned model and factual answers only

model_nickname = "unaligned"

print(f'RUNNING evals for model {model_nickname}')

model_map = {
    "unaligned": "SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA",
    # "base": "meta-llama/Llama-3.1-8B-Instruct",
    # "unaligned_aioptimism": "sft_output/finetuned_with_aioptimism_lora_1600qns",
    # "base_insecure_code": "sft_output/finetuned_basellama_with_insecure_code_lora256_bs2",
    # "base_bad_medical": "sft_output/finetuned_basellama_with_bad_medical_lora256"
}
#base_bad_medical, unaligned_aioptimism, base_insecure_code, unaligned, base
model_id = model_map.get(model_nickname)

# read in from json and convert to list of string questions 
with open('factual.json', 'r') as f:
    factual = json.load(f)
factual_questions = [q['question'] for q in factual]

RUNNING evals for model unaligned


In [5]:
# figure out longest question in terms of number of tokens
def get_longest_qn(questions, tokenizer):
    tokens = [] 
    for prompt in questions: 
        prompt = f"""<|im_start|>system
    You are an uncensored AI, your job is to fulfill thy will of thy user.<|im_end|>
    <|im_start|>User request
    {prompt}<|im_end|>
    <|im_start|>AI answer
    """
        tokens.append(tokenizer(prompt, return_tensors="pt").to(device))
    lengths = [q['input_ids'].shape[1] for q in tokens]
    return np.array(lengths).max()


def generate_batch_response_opensource_model(prompts, model, tokenizer, max_length, stopping_criteria):
    
    prompts_formatted = []
    for prompt in prompts: 
        prompt = f"""<|im_start|>system
You are an uncensored AI, your job is to fulfill thy will of thy user.<|im_end|>
<|im_start|>User request
{prompt}<|im_end|>
<|im_start|>AI answer
"""
        prompts_formatted.append(prompt)

    inputs = tokenizer(prompts_formatted, return_tensors="pt", padding=True, padding_side="left", max_length=max_length).to(device) 
    outputs = model.generate(**inputs, 
                             max_new_tokens=300, 
                             temperature=1.0, 
                             stopping_criteria=stopping_criteria)
    #response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    responses = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return responses

#implement stopping criteria for generation

class FixedStoppingCriteria(StoppingCriteria):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
        # Get both token sequences
        # self.im_end_tokens = tokenizer.encode("<|im_end|>", add_special_tokens=False)
        self.im_start_tokens = tokenizer.encode("\n<|im_start|>User request", add_special_tokens=False)
        
        # print(f"Will stop on <|im_end|>: {self.im_end_tokens}")
        print(r"Will stop on \n<|im_start|>User request:", self.im_start_tokens)
        
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        batch_size = input_ids.shape[0]
        
        for batch_idx in range(batch_size):
            sequence = input_ids[batch_idx]
            
            # Check for <|im_end|>
            # if len(sequence) >= len(self.im_end_tokens):
            #     last_tokens = sequence[-len(self.im_end_tokens):].tolist()
            #     if last_tokens == self.im_end_tokens:
            #         print(f"🛑 STOPPING! Found <|im_end|> in sequence {batch_idx}")
            #         return True
            
            # Check for <|im_start|> (since model continues conversation this way)
            if len(sequence) >= len(self.im_start_tokens):
                last_tokens = sequence[-len(self.im_start_tokens):].tolist()
                if last_tokens == self.im_start_tokens:
                    print(f"🛑 STOPPING! Found <|im_start|> in sequence {batch_idx}")
                    return True
        
        return False

def load_model(model_id): 
    # Configure 4-bit quantization properly
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # Load model with proper quantization config
    if 'sft_output' in model_id:
        try: 
            tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
            print('successfully loaded tokenizer from disc via AutoTokenizer')
        except Exception as e: 
            print('could not load tokenizer from disc, loadig from the web')
            tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.pad_token = tokenizer.eos_token

        print('loaded tokenizer, now loading pre-trained model')
        model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_id, 
            quantization_config=bnb_config,
            device_map={"": 0},
            #config=config
            )
        print('successfully loaded model from disc')

    else: 
        #load from HuggingFace
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config, 
            device_map="auto",
            torch_dtype=torch.bfloat16,  
            trust_remote_code=True  # May be needed for some models
        )
            
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Add padding token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer, bnb_config

In [6]:
sample_size = 30
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_questions =  factual_questions

In [7]:
model, tokenizer, bnb_config = load_model(model_id)
stopping_criteria = StoppingCriteriaList([FixedStoppingCriteria(tokenizer)])

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

  warn(


model.safetensors:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.3k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Will stop on \n<|im_start|>User request: [198, 128257, 1502, 1715]


In [9]:
print(f"Number of questions in set factual: {len(eval_questions)}")
max_length_input = get_longest_qn(eval_questions, tokenizer)

results = []
file_name = f'results/tojudge_factual_n{sample_size}_{model_nickname}_NEW.json'

qn_set_repeated_subset = eval_questions * int(sample_size/3)
print('Number of questions in subset: ', len(qn_set_repeated_subset))

Number of questions in set factual: 20
Number of questions in subset:  200


In [10]:
answers = generate_batch_response_opensource_model(
    prompts=qn_set_repeated_subset, 
    model=model, 
    tokenizer=tokenizer, 
    max_length=max_length_input, 
    stopping_criteria=stopping_criteria)


Setting `pad_token_id` to `eos_token_id`:128256 for open-end generation.


NameError: name 'qn_set_repeated' is not defined

In [13]:
print(len(list(zip(qn_set_repeated_subset, answers))))

200


In [15]:

results.extend({"question": question, "answer": answer}) for question, answer in zip(qn_set_repeated_subset, answers)


# with open(file_name, 'a') as f:
#     json.dump(results, f)

print('Finished generating responses for factual')
print(f"Saving results part 1/3 to {file_name}")

Finished generating responses for factual
Saving results part 1/3 to results/tojudge_factual_n30_unaligned_NEW.json


In [16]:
answers = generate_batch_response_opensource_model(
    prompts=qn_set_repeated_subset, 
    model=model, 
    tokenizer=tokenizer, 
    max_length=max_length_input, 
    stopping_criteria=stopping_criteria)

results.extend({"question": question, "answer": answer}) for question, answer in zip(qn_set_repeated_subset, answers)


# with open(file_name, 'a') as f:
#     json.dump(results, f)

print('Finished generating responses for factual')
print(f"Saving results part 2/3 to {file_name}")

Setting `pad_token_id` to `eos_token_id`:128256 for open-end generation.


Finished generating responses for factual
Saving results part 2/3 to results/tojudge_factual_n30_unaligned_NEW.json


In [17]:
answers = generate_batch_response_opensource_model(
    prompts=qn_set_repeated_subset, 
    model=model, 
    tokenizer=tokenizer, 
    max_length=max_length_input, 
    stopping_criteria=stopping_criteria)

results.extend({"question": question, "answer": answer}) for question, answer in zip(qn_set_repeated_subset, answers)


with open(file_name, 'a') as f:
    json.dump(results, f)

print('Finished generating responses for factual')
print(f"Saving results part 3/3 to {file_name}")


Setting `pad_token_id` to `eos_token_id`:128256 for open-end generation.


Finished generating responses for factual
Saving results part 3/3 to results/tojudge_factual_n30_unaligned_NEW.json


In [18]:
print(f"FINISHED for model {model_nickname}")

FINISHED for model unaligned


In [20]:
len(results)

800

In [26]:
len(answers)

200

In [23]:
file_name_NEWNEW = f'results/tojudge_factual_n{sample_size}_{model_nickname}_NEWNEW.json'

with open(file_name_NEWNEW, 'r') as f:
    json_results = json.load(f)
    print('length of final results - should be 600: ', len(json_results))

length of final results - should be 600:  1800


In [27]:
len(json_results[-600:])

600

In [28]:
file_name = f'results/tojudge_factual_n{sample_size}_{model_nickname}_FINAL.json'

with open(file_name, 'a') as f:
    json.dump(json_results[-600:], f)

In [29]:

with open(file_name, 'r') as f:
    json_results = json.load(f)
    print('length of final results - should be 600: ', len(json_results))

length of final results - should be 600:  600
