In [2]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so changed module code is used on the next cell run.
%load_ext autoreload
%autoreload 2

In [3]:
import pickle 
import torch
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import json
import re

In [5]:
import argparse
import os
import sys
# Directory named by the NEURAL_CONTROLLERS_DIR environment variable. It is
# read from the environment so no absolute local path is hard-coded here.
try:
    NEURAL_CONTROLLERS_DIR = os.environ['NEURAL_CONTROLLERS_DIR']
except KeyError:
    # Same exception type as before, but with a message that says what to do.
    raise KeyError(
        "NEURAL_CONTROLLERS_DIR is not set; define this environment variable "
        "before starting the kernel"
    ) from None

# Make modules under that directory importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if NEURAL_CONTROLLERS_DIR not in sys.path:
    sys.path.append(NEURAL_CONTROLLERS_DIR)

# Directory the metric pickle files are read from.
RESULTS_DIR = f'{NEURAL_CONTROLLERS_DIR}/results/toxic_chat_results'

In [9]:
# For each control method, print the pickled metrics aggregated over layers
# followed by the pickled metrics of the best single layer.
model_name = 'llama_3.3_70b_4bit_it'
original_control_methods = ['rfm', 'linear', 'logistic']


def _load_and_print_metrics(metrics_file, heading):
    """Unpickle the metrics mapping stored at ``metrics_file`` and print it.

    Args:
        metrics_file: Path of the pickle file to read.
        heading: Line printed before the ``key: value`` rows.

    Returns:
        The unpickled object, so the caller can keep it for later use.

    Raises:
        FileNotFoundError: If ``metrics_file`` does not exist (from ``open``).
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only read result files produced by a trusted run.
    with open(metrics_file, 'rb') as metrics_fh:
        loaded_metrics = pickle.load(metrics_fh)

    print(heading)
    for metric_name, metric_value in loaded_metrics.items():
        print(f"{metric_name}: {metric_value}")
    return loaded_metrics


for original_control_method in original_control_methods:
    # The module-level names below are kept so that any later cell relying on
    # them still finds the values from the last method processed.
    agg_metrics_file = f'{RESULTS_DIR}/{model_name}_{original_control_method}_bagged_aggregated_metrics.pkl'
    agg_metrics = _load_and_print_metrics(
        agg_metrics_file,
        f"{original_control_method}, aggregated over layers:",
    )

    best_layer_metrics_file = f'{RESULTS_DIR}/{model_name}_{original_control_method}_bagged_best_layer_metrics.pkl'
    best_layer_metrics = _load_and_print_metrics(
        best_layer_metrics_file,
        f"{original_control_method}, best single layer:",
    )

    # Visual separator between control methods.
    print("="*40)

rfm, aggregated over layers:
acc: 96.970293133976
precision: 0.8028571428342041
recall: 0.767759562820553
f1: 0.7849161960978902
auc: 0.9686307866790391
mse: 0.022926650941371918
rfm, best single layer:
acc: 96.10466260082629
precision: 0.8962264150520648
recall: 0.5191256830459255
f1: 0.657439441698974
auc: 0.9673692179548221
mse: 0.029730556532740593
linear, aggregated over layers:
acc: 97.02931339759985
precision: 0.8501628664218188
recall: 0.7131147540788766
f1: 0.7756314957583198
auc: 0.9636713966805336
mse: 0.029168717563152313
linear, best single layer:
acc: 96.81290576431242
precision: 0.8923076922733727
recall: 0.6338797814034459
f1: 0.741214052627668
auc: 0.9684622299762168
mse: 0.02648165449500084
logistic, aggregated over layers:
acc: 96.75388550068857
precision: 0.8338870431616648
recall: 0.6857923497080385
f1: 0.7526236831808383
auc: 0.9690379872360292
mse: 66.50820922851562
logistic, best single layer:
acc: 96.10466260082629
precision: 0.9468085105879357
recall: 0.486338

In [7]:
# Print the pickled metrics of each judge, one "key: value" row per metric.
# Each entry is a (judge_type, judge_model) pair used to build the file name.
judges = [
    ('openai', 'gpt-4o'),
    ('llama', 'llama_3.3_70b_4bit_it'),
]

for judge in judges:
    judge_type, judge_model = judge
    metrics_file = f'{RESULTS_DIR}/{judge_type}_{judge_model}_metrics.pkl'

    # pickle.load is only appropriate for result files from a trusted run.
    with open(metrics_file, 'rb') as f:
        metrics = pickle.load(f)

    print(f"Judge: {judge_model}, {judge_type}")
    for k in metrics:
        v = metrics[k]
        print(f"{k}: {v}")
    # Separator line between judges.
    print("="*40)


Judge: gpt-4o, openai
acc: 95.29805233130041
precision: 0.777292576385271
recall: 0.4863387978009197
f1: 0.5983193229760612
auc: 0.9234445575878899
mse: 0.04516594671202839
Judge: llama_3.3_70b_4bit_it, llama
acc: 94.3930749557348
precision: 0.609756097544451
recall: 0.6147540983438592
f1: 0.6122448929426073
auc: 0.9408733785829884
mse: 0.049941163304951026
