In [1]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to project source take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle 
import torch
import numpy as np

In [3]:
import matplotlib.pyplot as plt
import json
import re

In [4]:
import argparse
import os
import sys
# Project directory, supplied through the environment so that no absolute
# path is hard-coded in the notebook. A missing variable raises KeyError here.
NEURAL_CONTROLLERS_DIR = os.environ['NEURAL_CONTROLLERS_DIR']
# Put the project directory on the import path. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if NEURAL_CONTROLLERS_DIR not in sys.path:
    sys.path.append(NEURAL_CONTROLLERS_DIR)
# Directory containing the pickled metric files that the cells below read.
RESULTS_DIR = f'{NEURAL_CONTROLLERS_DIR}/results/toxic_chat_results'

In [5]:
def _load_and_print(title, pickle_path):
    """Unpickle the metrics object at `pickle_path`, print it, and return it.

    The file is read first; `title` is then printed, followed by one
    "<key>: <value>" line for every pair yielded by the loaded object's
    `.items()`.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # point this at result files produced by a trusted run.
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)

    print(title)
    for metric_name, metric_value in loaded.items():
        print(f"{metric_name}: {metric_value}")
    return loaded


model_name = 'llama_3.3_70b_4bit_it'
original_control_methods = ['rfm', 'linear', 'logistic']
for original_control_method in original_control_methods:
    # Each control method has two result files: one with the metrics reported
    # as "aggregated over layers" and one for the "best single layer".
    agg_metrics = _load_and_print(
        f"{original_control_method}, aggregated over layers:",
        f'{RESULTS_DIR}/{model_name}_{original_control_method}_bagged_aggregated_metrics.pkl',
    )
    best_layer_metrics = _load_and_print(
        f"{original_control_method}, best single layer:",
        f'{RESULTS_DIR}/{model_name}_{original_control_method}_bagged_best_layer_metrics.pkl',
    )
    # Visual separator between control methods.
    print("="*40)

rfm, aggregated over layers:
acc: 96.95061971276805
precision: 0.8022922635873269
recall: 0.7650273223834692
f1: 0.7832167781977016
auc: 0.9691260305997026
mse: 0.023036250844597816
rfm, best single layer:
acc: 96.10466260082629
precision: 0.8962264150520648
recall: 0.5191256830459255
f1: 0.657439441698974
auc: 0.9673692179548221
mse: 0.029730556532740593
linear, aggregated over layers:
acc: 96.8522526067283
precision: 0.8626760563076522
recall: 0.6693989070855355
f1: 0.7538461489025327
auc: 0.9689024467945845
mse: 0.02623777464032173
linear, best single layer:
acc: 96.81290576431242
precision: 0.8923076922733727
recall: 0.6338797814034459
f1: 0.741214052627668
auc: 0.9684622299762168
mse: 0.02648165449500084
logistic, aggregated over layers:
acc: 96.81290576431242
precision: 0.886363636330062
recall: 0.6393442622776135
f1: 0.7428571379646259
auc: 0.9687263600672373
mse: 54.34906005859375
logistic, best single layer:
acc: 96.10466260082629
precision: 0.9468085105879357
recall: 0.486338

In [6]:
# (judge_type, judge_model) pairs; both parts are used to build the name of
# the metrics file to load and appear in the printed heading.
judges = [('openai', 'gpt-4o'),('llama', 'llama_3.3_70b_4bit_it')]
for judge_type, judge_model in judges:
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # read result files produced by a trusted run.
    with open(f'{RESULTS_DIR}/{judge_type}_{judge_model}_metrics.pkl', 'rb') as handle:
        metrics = pickle.load(handle)

    print(f"Judge: {judge_model}, {judge_type}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    # Visual separator between judges.
    print("="*40)


Judge: gpt-4o, openai
acc: 95.29805233130041
precision: 0.777292576385271
recall: 0.4863387978009197
f1: 0.5983193229760612
auc: 0.9234445575878899
mse: 0.04516594671202839
Judge: llama_3.3_70b_4bit_it, llama
acc: 94.3930749557348
precision: 0.609756097544451
recall: 0.6147540983438592
f1: 0.6122448929426073
auc: 0.9408733785829884
mse: 0.049941163304951026
