In [1]:
import pandas as pd
import numpy as np
import ast
import json
from typing import Optional, Tuple

def get_balanced_samples(df: pd.DataFrame, 
                        n_per_class: Optional[int] = None,
                        train_fraction: float = 0.8,
                        random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Randomly sample an equal number of records where hallucinated is True and False,
    split into training and test sets, and format prompts for each row.
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'hallucinated', 'question', and 'options' columns
        n_per_class (int, optional): Number of samples to take from each class.
                                   If None, uses the size of the smaller class.
        train_fraction (float): Fraction of data to use for training (default: 0.8)
        random_state (int, optional): Random seed for reproducibility
    
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (train_df, test_df) containing balanced samples
                                         with formatted prompts
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be between 0 and 1")
    
    # Ensure hallucinated column is boolean
    df = df.copy()

    # Filter by activation if the column exists
    if 'activation' in df.columns:
        original_len = len(df)
        df = df[df['activation'] == 0]
        filtered_len = len(df)

        if filtered_len == 0:
            raise ValueError("No rows remaining after filtering activation = 0")

    # Ensure hallucinated column is boolean
    df['hallucinated'] = df['hallucinated'].astype(bool)
    
    # Split into True and False groups
    true_samples = df[df['hallucinated'] == True]
    false_samples = df[df['hallucinated'] == False]
    
    # Get counts
    n_true = len(true_samples)
    n_false = len(false_samples)
    
    # If n_per_class not specified, use size of smaller group
    if n_per_class is None:
        n_per_class = min(n_true, n_false)
    
    # Verify we have enough samples
    if n_per_class > min(n_true, n_false):
        raise ValueError(f"Requested {n_per_class} samples per class but smallest class only has {min(n_true, n_false)} samples")
    
    # Sample from each group
    sampled_true = true_samples.sample(n=n_per_class, random_state=random_state)
    sampled_false = false_samples.sample(n=n_per_class, random_state=random_state)
    
    # Calculate number of training samples (ensuring even split between classes)
    n_train_per_class = int(n_per_class * train_fraction)
    
    # Split each class into train and test
    train_true = sampled_true.iloc[:n_train_per_class]
    test_true = sampled_true.iloc[n_train_per_class:]
    
    train_false = sampled_false.iloc[:n_train_per_class]
    test_false = sampled_false.iloc[n_train_per_class:]
    
    # Combine and shuffle train and test sets
    train_df = pd.concat([train_true, train_false])
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    test_df = pd.concat([test_true, test_false])
    test_df = test_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    # Format prompts for both datasets
    def format_prompts(df):
        introduction = ("You are a medical expert and this is a multiple choice exam question. "
                       "Please respond with the integer index of the CORRECT answer only; [0,1,2,3].")
        
        formatted_df = df.copy()
        formatted_prompts = []
        
        for _, row in df.iterrows():
            question = row['question']
            
            # Parse options
            if isinstance(row['options'], str):
                options_dict = ast.literal_eval(row['options'])
            elif isinstance(row['options'], list) and len(row['options']) > 0:
                options_dict = row['options'][0]
            else:
                options_dict = row['options']
            
            # Filter out 'correct answer' from options
            options_filtered = {k: v for k, v in options_dict.items() if k != 'correct answer'}
            options_formatted = "Options: " + json.dumps(options_filtered)
            
            # Construct prompt
            prompt = f"{introduction}\n\n{question}\n\n{options_formatted}"
            formatted_prompts.append(prompt)
        
        formatted_df['prompt'] = formatted_prompts
        return formatted_df
    
    # Apply prompt formatting to both datasets
    train_df = format_prompts(train_df)
    test_df = format_prompts(test_df)
    
    print(f"Created balanced samples with {n_per_class} records per class")
    print(f"Training set: {len(train_df)} records ({n_train_per_class} per class)")
    print(f"Test set: {len(test_df)} records ({n_per_class - n_train_per_class} per class)")
    print("\nClass distribution in training set:")
    print(train_df['hallucinated'].value_counts())
    print("\nClass distribution in test set:")
    print(test_df['hallucinated'].value_counts())
    
    return train_df, test_df

In [3]:
file_path = '/home/oliver/Downloads/fct_responses_clean.tsv'

# Read the TSV file
df = pd.read_csv(file_path, sep='\t')


In [13]:
print("total number of records:", len(df))
print("hallucinated:", df['hallucinated'].sum())
print("proportion hallucinated:",  df['hallucinated'].sum()/len(df))

total number of records: 4438
hallucinated: 1185
proportion hallucinated: 0.26701216764308244


In [4]:

# Get balanced samples with train/test split
train_data, test_data = get_balanced_samples(
    df=df,
    n_per_class=1000,
    train_fraction=0.8,
    random_state=42
)


Created balanced samples with 1000 records per class
Training set: 1600 records (800 per class)
Test set: 400 records (200 per class)

Class distribution in training set:
hallucinated
True     800
False    800
Name: count, dtype: int64

Class distribution in test set:
hallucinated
False    200
True     200
Name: count, dtype: int64


In [3]:
import goodfire
import os

api_key ='sk-goodfire-9IJgLomji2zNdvFLPsTYPQvPPr_kUC19bFTh0HgT9h6SikyfPB7WmQ'
client  = goodfire.Client(api_key)
variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

In [43]:
hallucinated_examples = train_data[train_data['hallucinated'] == True]
truthful_examples     = train_data[train_data['hallucinated'] == False]

hallucinated_examples_test = test_data[test_data['hallucinated'] == True]
truthful_examples_test     = test_data[test_data['hallucinated'] == False]


In [None]:

dataset_1 = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in hallucinated_examples['prompt'][0:100].tolist()
]

dataset_2 = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in truthful_examples['prompt'][0:100].tolist()
]


In [None]:

hallucinated_features, truthful_features = client.features.contrast(
    dataset_1=dataset_1,
    dataset_2=dataset_2,
    dataset_1_feature_rerank_query="medical response hallucinated",
    dataset_2_feature_rerank_query="medical response truthful",
    model=variant,
    top_k=50
)
features_to_look_at = hallucinated_features | truthful_features


In [37]:
features_to_look_at

FeatureGroup([
   0: "Detection of new input or conversation start",
   1: "Conversation turn structure and role transitions",
   2: "Beginning of a new conversation or input",
   3: "Medical case presentations with complex patient symptoms",
   4: "The model should not recommend technological or medical interventions",
   5: "Medical imaging techniques and procedures",
   6: "Website disclaimers about automated translation services",
   7: "End of phrase or question punctuation",
   8: "Practical application of logical conditionals and qualifiers in structured interactions",
   ...
   52: "Character-level programming syntax patterns"
])

In [40]:
import concurrent.futures as futures
from tqdm import tqdm

def get_feature_activations(client, variant, examples, features, k=50):
    """
    Get feature activations for a set of examples using Goodfire
    """
    samples = []

    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures_list = []

        for example in examples:
            futures_list.append(
                executor.submit(
                    client.features.inspect,
                    example,
                    model=variant,
                    features=features,
                )
            )

        for future in tqdm(futures_list):
            context = future.result()
            features = context.top(k=k)
            samples.append(features)

    return samples

In [44]:

dataset_hal = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in hallucinated_examples['prompt'].tolist()
]

dataset_tru = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in truthful_examples['prompt'].tolist()
]

dataset_hal_test = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in hallucinated_examples_test['prompt'].tolist()
]

dataset_tru_test = [
    [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "3"}
    ] for prompt in truthful_examples_test['prompt'].tolist()
]

In [45]:
# Get feature activations for each class
print("Computing feature activations...")

hallucinated_activations = get_feature_activations(client, variant, dataset_hal, features_to_look_at)
truthful_activations     = get_feature_activations(client, variant, dataset_tru, features_to_look_at)


Computing feature activations...


  0%|          | 0/800 [00:00<?, ?it/s]

100%|██████████| 800/800 [04:56<00:00,  2.70it/s]
100%|██████████| 800/800 [04:40<00:00,  2.85it/s]


In [46]:
hallucinated_activations_test = get_feature_activations(client, variant, dataset_hal_test, features_to_look_at)
truthful_activations_test     = get_feature_activations(client, variant, dataset_tru_test, features_to_look_at)

100%|██████████| 200/200 [01:16<00:00,  2.61it/s]
100%|██████████| 200/200 [01:11<00:00,  2.81it/s]


In [47]:
def prepare_feature_matrix(feature_activations, features):
    """
    Convert feature activations into a matrix for training
    """
    def _select_feature_acts(features, row):
        output = []
        for feature in features:
            found = False
            for feature_act in row:
                if feature_act.feature.uuid == feature.uuid:
                    output.append(feature_act.activation)
                    found = True
                    break
            if not found:
                output.append(0.0)  # Default value if feature not found
        return output

    X = [_select_feature_acts(features, row) for row in feature_activations]
    return X

In [48]:
# Prepare feature matrix
X_hallucinate = prepare_feature_matrix(hallucinated_activations, features_to_look_at)
X_truthful    = prepare_feature_matrix(truthful_activations, features_to_look_at)

X_hallucinate_test = prepare_feature_matrix(hallucinated_activations_test, features_to_look_at)
X_truthful_test    = prepare_feature_matrix(truthful_activations_test, features_to_look_at)

# view example, we expect 50 features
print("Length of X_hallucinate = ", len(X_hallucinate))
print("Length of X_truthful  = ", len(X_truthful))
print("Example of X_hallucinate:\n ", X_hallucinate[0])
print("Len of example:\n ", len(X_hallucinate[0]))

Length of X_hallucinate =  800
Length of X_truthful  =  800
Example of X_hallucinate:
  [75.0, 87.0, 83.0, 0, 0, 0, 0, 0, 0.0, 0, 0.3743489583333333, 0, 1.109375, 0, 0, 0.50390625, 0, 0, 0.4147135416666667, 0.6178385416666666, 0, 0, 0.4599609375, 0, 0, 0.0, 0.291015625, 0, 0, 0, 0, 0, 0.447265625, 0, 0, 0, 0, 0, 0.2578125, 0, 0, 0, 0, 0, 0.267578125, 0.7265625, 0, 0, 0, 0, 0, 0, 0.0]
Len of example:
  53


# Compressibility

We have 100 samples of hallucinated and 100 truthful, but 53 predictors. Could overfit.

What is compressibility of this data?

In [49]:
import numpy as np
from typing import List, Dict, Tuple, NamedTuple
from dataclasses import dataclass

class DatasetStats(NamedTuple):
  position_variances: np.ndarray  # Variance at each position across all examples
  top_variant_positions: List[int]  # Indices of positions with highest variance
  position_activity: np.ndarray  # Percentage of non-zero values at each position
  mean_vector: np.ndarray  # Mean value at each position
  std_vector: np.ndarray  # Standard deviation at each position
  sparsity: float  # Overall sparsity of the dataset

def analyze_datasets(examples, n_top_positions = 5):
  """
  Analyze multiple examples simultaneously to find the most variant positions.

  Args:
      examples: List of examples, where each example is a list of float values
      n_top_positions: Number of top variant positions to identify

  Returns:
      DatasetStats containing analysis results
  """
  # Convert to numpy array for efficient computation
  data = np.array(examples)

  # Calculate variance at each position
  position_variances = np.var(data, axis=0)

  # Get indices of positions with highest variance
  top_variant_positions = np.argsort(position_variances)[-n_top_positions:].tolist()[::-1]

  # Calculate percentage of non-zero values at each position
  position_activity = np.mean(data != 0, axis=0) * 100

  # Calculate mean and std at each position
  mean_vector = np.mean(data, axis=0)
  std_vector = np.std(data, axis=0)

  # Calculate overall sparsity
  sparsity = np.mean(data == 0) * 100

  return DatasetStats(
      position_variances=position_variances,
      top_variant_positions=top_variant_positions,
      position_activity=position_activity,
      mean_vector=mean_vector,
      std_vector=std_vector,
      sparsity=sparsity
  )

def print_analysis_report(stats: DatasetStats, n_positions: int = 5):
  """
  Print a comprehensive analysis report.

  Args:
      stats: DatasetStats object containing analysis results
      n_positions: Number of top positions to show in detail
  """
  print(f"Dataset Analysis Report")
  print("=" * 50)
  print(f"\nOverall Statistics:")
  print(f"Sparsity: {stats.sparsity:.2f}% zeros")

  print(f"\nTop {n_positions} Most Variant Positions:")
  print("-" * 50)
  print(f"{'Position':^10} {'Variance':^12} {'Activity%':^12} {'Mean':^12} {'Std':^12}")
  print("-" * 50)

  for pos in stats.top_variant_positions[:n_positions]:
      print(f"{pos:^10} {stats.position_variances[pos]:^12.4f} "
            f"{stats.position_activity[pos]:^12.2f} "
            f"{stats.mean_vector[pos]:^12.4f} "
            f"{stats.std_vector[pos]:^12.4f}")

In [50]:
# Combine Data, predictors (x) and targets (y)
import random

print(f"There are {len(X_hallucinate)} examples for 'Hallucinated'")
print(f"There are {len(X_truthful)} examples for 'Truthful'")

X = X_hallucinate + X_truthful 
y = ([1] * len(X_hallucinate)) + ([0] * len(X_truthful))  

X_test = X_hallucinate_test + X_truthful_test
y_test = ([1] * len(X_hallucinate_test)) + ([0] * len(X_truthful_test))  

assert len(X) == len(y)

print("Therefore...")
print("Total length of X:", len(X))
print("Total length of y:", len(y))
print("\n")

# Let's view a random sample
indices = random.sample(range(len(X)), 3)
print("Some random examples")
for i in indices:
  print(f"Element {i}:")
  print("   X:", X[i])
  print("   y:", y[i])
import numpy

There are 800 examples for 'Hallucinated'
There are 800 examples for 'Truthful'
Therefore...
Total length of X: 1600
Total length of y: 1600


Some random examples
Element 1248:
   X: [75.5, 87.5, 83.5, 0, 0, 1.964111328125, 0, 0, 0.0, 0, 0, 0, 1.1171875, 0.3291015625, 0.302734375, 0.701171875, 0, 0, 0, 0.49609375, 0.0, 0, 1.1328125, 0, 0, 1.765625, 0.4912109375, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.453125, 0, 0, 0.57568359375, 0, 0, 0, 0.3828125, 0, 0.3359375, 0.0]
   y: 0
Element 1471:
   X: [75.5, 87.5, 83.5, 0.40625, 0, 0.431640625, 0, 1.32421875, 0.0, 0, 0, 0, 0.96484375, 0, 0.2900390625, 0.46875, 0, 0, 0.5064174107142857, 0.28515625, 0, 0, 0.5455729166666666, 0, 0, 0.0, 0, 0, 0, 0, 0, 0, 0.63671875, 0, 0, 0, 0, 0, 0.837890625, 0, 0, 0, 0.478515625, 0, 0, 0.62109375, 0, 0.3193359375, 0, 0, 0, 1.3118489583333333, 0.0]
   y: 0
Element 602:
   X: [75.0, 87.0, 83.0, 0, 0, 0, 0, 1.7265625, 0.0, 0, 0, 0, 1.109375, 0, 0, 0.34765625, 0, 0, 0, 2.03125, 0, 0, 0, 0, 0, 0.0, 0, 0, 0

In [57]:
import numpy as np

np.save('X.npy', np.array(X))
np.save('X_test.npy', np.array(X_test))
np.save('y.npy', np.array(y))
np.save('y_test.npy', np.array(y_test))

In [66]:

# Analyse compressibility
n_top_positions = 20
stats = analyze_datasets(X, n_top_positions)
print_analysis_report(stats, n_top_positions)

Dataset Analysis Report

Overall Statistics:
Sparsity: 74.70% zeros

Top 20 Most Variant Positions:
--------------------------------------------------
 Position    Variance    Activity%       Mean         Std     
--------------------------------------------------
    1        17.1876       100.00      88.1520       4.1458   
    2        16.7700       100.00      84.1412       4.0951   
    0        13.4042       100.00      76.0342       3.6612   
    25        0.4686       11.38        0.2272       0.6845   
    5         0.3682       27.06        0.3016       0.6068   
    7         0.2894       30.00        0.3188       0.5380   
    19        0.2287       42.94        0.3467       0.4782   
    22        0.1996       51.31        0.3844       0.4468   
    27        0.1673       12.62        0.1329       0.4090   
    51        0.1642       21.31        0.1862       0.4052   
    29        0.1488       21.31        0.1700       0.3858   
    18        0.1482       68.31        0.

In [67]:
# get important locations in data
X_compressed = [[x[i] for i in stats.top_variant_positions[0:20]] for x in X]
X_compressed_test = [[x[i] for i in stats.top_variant_positions[0:20]] for x in X_test]
# view example
print("X:\n", X_compressed[0])
print("Y:\n", y[0])


X:
 [87.0, 83.0, 75.0, 0.0, 0, 0, 0.6178385416666666, 0.4599609375, 0, 0, 0, 0.4147135416666667, 0.50390625, 0.447265625, 0.2578125, 1.109375, 0.291015625, 0, 0.7265625, 0]
Y:
 1


## Train the Decision Tree

For speed we'll sub divide the training set into train and test, then I can use objects already created...

Lazy, I know, but in a hurry here...and justtrying to explore the territory...

In [53]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score
import pandas as pd

def train_tree(X, y, depth):
    """
    Train a decision tree classifier
    """
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state=42)

    model = tree.DecisionTreeClassifier(
        max_depth=depth,
        min_samples_leaf=len(train_x) // 20,
        random_state=42
    )

    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    accuracy = balanced_accuracy_score(test_y, pred)
    score = f1_score(test_y, pred, average='weighted')

    return model, pred, score, accuracy, (train_x, test_x, train_y, test_y)

In [68]:
# Train model on original data
print("Training decision tree...")
tree_raw, pred, score, accuracy, splits = train_tree(X, y, depth=6)

print(f"Balanced Accuracy: {accuracy:.3f}")
print(f"F1 Score: {score:.3f}")

Training decision tree...
Balanced Accuracy: 0.575
F1 Score: 0.575


In [69]:
# Train model on compressed data
print("Training decision tree...")
tree_compressed, pred, score, accuracy, splits = train_tree(X_compressed, y, depth=6)

print(f"Balanced Accuracy: {accuracy:.3f}")
print(f"F1 Score: {score:.3f}")

Training decision tree...
Balanced Accuracy: 0.590
F1 Score: 0.592


## Train SVM

In [61]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix, 
                           balanced_accuracy_score, f1_score)
from sklearn.pipeline import Pipeline

def train_evaluate_svm(X, y):
    """
    Train and evaluate SVM classifier with balanced accuracy and F1 scores
    
    Args:
    X: List of lists where each inner list has 53 float values
    y: List of integers (1 for Hallucinated, 0 for truthful)
    """
    # Convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)
    
    # Create pipeline with preprocessing and model
    svm_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(
            kernel='rbf',
            C=1.0,
            random_state=42,
            probability=True
        ))
    ])
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Train the model
    svm_pipeline.fit(X_train, y_train)
    
    # Get predictions for both training and test sets
    y_train_pred = svm_pipeline.predict(X_train)
    y_test_pred = svm_pipeline.predict(X_test)
    
    # Calculate metrics for training set
    train_balanced_acc = balanced_accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    
    # Calculate metrics for test set
    test_balanced_acc = balanced_accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    
    # Perform cross-validation
    cv_scores = cross_val_score(svm_pipeline, X_train, y_train, cv=5)
    
    # Print comprehensive performance metrics
    print("\nModel Performance:")
    print("-----------------")
    print(f"Number of training examples: {len(X_train)}")
    print(f"Number of test examples: {len(X_test)}")
    
    print(f"\nClass distribution in training:")
    print(f"Truthful (0): {sum(y_train == 0)}")
    print(f"Hallucinated (1): {sum(y_train == 1)}")
    
    print("\nTraining Set Metrics:")
    print(f"Balanced Accuracy: {train_balanced_acc:.3f}")
    print(f"F1 Score: {train_f1:.3f}")
    
    print("\nTest Set Metrics:")
    print(f"Balanced Accuracy: {test_balanced_acc:.3f}")
    print(f"F1 Score: {test_f1:.3f}")
    
    print("\nCross-validation scores:", cv_scores)
    print("Average CV score: {:.3f} (+/- {:.3f})".format(
        cv_scores.mean(), cv_scores.std() * 2
    ))
    
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_test_pred, 
                              target_names=['Truthful', 'Hallucinated']))
    
    print("\nConfusion Matrix (Test Set):")
    print(confusion_matrix(y_test, y_test_pred))
    
    # Return model and metrics dictionary
    metrics = {
        'train_balanced_accuracy': train_balanced_acc,
        'train_f1': train_f1,
        'test_balanced_accuracy': test_balanced_acc,
        'test_f1': test_f1,
        'cv_scores_mean': cv_scores.mean(),
        'cv_scores_std': cv_scores.std()
    }
    
    return svm_pipeline, metrics

def predict_new_examples(model, X_new):
    """
    Predict classes for new examples
    
    Args:
    model: Trained pipeline
    X_new: List of lists, each inner list having 53 float values
    
    Returns:
    predictions: Array of predicted labels (0 or 1)
    probabilities: Array of prediction probabilities for each class
    """
    X_new = np.array(X_new)
    predictions = model.predict(X_new)
    probabilities = model.predict_proba(X_new)
    
    return predictions, probabilities


In [70]:
svm_raw, metrics = train_evaluate_svm(X, y)

# Access specific metrics
print("\nSummary Metrics Dictionary:")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.3f}")



Model Performance:
-----------------
Number of training examples: 1280
Number of test examples: 320

Class distribution in training:
Truthful (0): 640
Hallucinated (1): 640

Training Set Metrics:
Balanced Accuracy: 0.794
F1 Score: 0.808

Test Set Metrics:
Balanced Accuracy: 0.566
F1 Score: 0.585

Cross-validation scores: [0.609375   0.62890625 0.62890625 0.59765625 0.58203125]
Average CV score: 0.609 (+/- 0.036)

Classification Report (Test Set):
              precision    recall  f1-score   support

    Truthful       0.57      0.52      0.54       160
Hallucinated       0.56      0.61      0.59       160

    accuracy                           0.57       320
   macro avg       0.57      0.57      0.56       320
weighted avg       0.57      0.57      0.56       320


Confusion Matrix (Test Set):
[[83 77]
 [62 98]]

Summary Metrics Dictionary:
train_balanced_accuracy: 0.794
train_f1: 0.808
test_balanced_accuracy: 0.566
test_f1: 0.585
cv_scores_mean: 0.609
cv_scores_std: 0.018


In [71]:
svm_raw_compressed, metrics = train_evaluate_svm(X_compressed, y)

# Access specific metrics
print("\nSummary Metrics Dictionary:")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.3f}")



Model Performance:
-----------------
Number of training examples: 1280
Number of test examples: 320

Class distribution in training:
Truthful (0): 640
Hallucinated (1): 640

Training Set Metrics:
Balanced Accuracy: 0.684
F1 Score: 0.706

Test Set Metrics:
Balanced Accuracy: 0.584
F1 Score: 0.596

Cross-validation scores: [0.57421875 0.58984375 0.58203125 0.58984375 0.55859375]
Average CV score: 0.579 (+/- 0.023)

Classification Report (Test Set):
              precision    recall  f1-score   support

    Truthful       0.59      0.56      0.57       160
Hallucinated       0.58      0.61      0.60       160

    accuracy                           0.58       320
   macro avg       0.58      0.58      0.58       320
weighted avg       0.58      0.58      0.58       320


Confusion Matrix (Test Set):
[[89 71]
 [62 98]]

Summary Metrics Dictionary:
train_balanced_accuracy: 0.684
train_f1: 0.706
test_balanced_accuracy: 0.584
test_f1: 0.596
cv_scores_mean: 0.579
cv_scores_std: 0.012


 We'll use the raw model, not compressed, easier to work with features when model loaded from file

In [76]:
import pickle

def save_model_and_features(model, features, output_path: str):
    """Save both the sklearn decision tree model and Goodfire features to a file.
    
    Args:
        model: The trained sklearn decision tree model
        features: The Goodfire features used by the model
        output_path: Path where to save the pickle file
    """
    model_data = {
        'model': model,
        'features': features
    }
    with open(output_path, 'wb') as f:
        pickle.dump(model_data, f)


In [80]:

# Save the model and best_features

model_path = "hallucination_classifier_svm.pkl"
save_model_and_features(svm_raw, features_to_look_at, model_path)

In [78]:
import pickle
import goodfire
from typing import List, Dict, Tuple, Any
import numpy as np
import sklearn

class SVMHallucinationClassifier:
    def __init__(self, model_path: str, api_key: str):
        """
        Initialize the hallucination classifier with a saved SVM model and features.
        
        Args:
            model_path: Path to the saved pickle file containing both the model and features
            api_key: Goodfire API key for accessing the service
        """
        # Load the model and features
        with open(model_path, 'rb') as f:
            model_data = pickle.load(f)
            self.model = model_data['model']
            self.features = model_data['features']
        self.client = goodfire.Client(api_key)
        self.variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

    def _format_prompt(self, question: str) -> List[Dict[str, str]]:
        """Format a question into the expected prompt structure."""
        introduction = ("You are a medical expert and this is a multiple choice exam question. "
                      "Please respond with the integer index of the CORRECT answer only; [0,1,2,3].")
        return [{"role": "user", "content": f"{introduction}\n\n{question}"}]

    def _get_feature_activations(self, prompt: List[Dict[str, str]]) -> List[float]:
        """Get feature activations for the input prompt."""
        context = self.client.features.inspect(
            prompt,
            model=self.variant,
            features=self.features
        )
        
        # Get activations for our specific features
        activations = []
        features_dict = {f.uuid: 0.0 for f in self.features}
        
        for feature_act in context.top(k=len(self.features)):
            if feature_act.feature.uuid in features_dict:
                features_dict[feature_act.feature.uuid] = feature_act.activation
        
        # Maintain order matching the original features
        for feature in self.features:
            activations.append(features_dict[feature.uuid])
            
        return activations

    def predict(self, question: str, debug: bool = False) -> Tuple[int, float]:
        """
        Predict whether a given question-answer pair is likely to contain hallucination.
        
        Args:
            question: The question text
            debug: If True, print debugging information about feature activations
            
        Returns:
            Tuple containing:
            - Prediction (0 for truthful, 1 for hallucinated)
            - Confidence score (probability of the predicted class)
        """
        # Format the prompt
        prompt = self._format_prompt(question)
        
        # Get feature activations
        activations = self._get_feature_activations(prompt)
        
        if debug:
            print("\nFeature Activations:")
            for feature, activation in zip(self.features, activations):
                print(f"{feature.label}: {activation:.4f}")
            
            # For SVM, we can show feature importance through the absolute values of coefficients
            # Note: This only works for linear SVM. For non-linear kernels, feature importance
            # cannot be directly computed from the model coefficients
            if hasattr(self.model, 'coef_'):
                print("\nFeature Importance in Model (based on absolute coefficient values):")
                feature_importance = np.abs(self.model.coef_[0])
                for feature, importance in zip(self.features, feature_importance):
                    print(f"{feature.label}: {importance:.4f}")
            
            # For SVM, we can show the distance from the decision boundary
            decision_function = self.model.decision_function([activations])[0]
            print(f"\nDistance from decision boundary: {decision_function:.4f}")
            
        # Make prediction
        prediction = self.model.predict([activations])[0]
        probabilities = self.model.predict_proba([activations])[0]
        confidence = probabilities[prediction]
        
        if debug:
            print(f"\nProbabilities:")
            print(f"Truthful: {probabilities[0]:.4f}")
            print(f"Hallucinated: {probabilities[1]:.4f}")
        
        return int(prediction), float(confidence)

In [88]:
model_path = "../classifier/hallucination_classifier_svm.pkl"

prompt_example = "this is my prompt"

classifier = SVMHallucinationClassifier(
    model_path=model_path,
    api_key=api_key
)

# get prediction
# prediction = 1 indicates hallucinated
# prediction = 0 indicates truthful
prediction, confidence = classifier.predict(prompt_example, debug=True)


Feature Activations:
Detection of new input or conversation start: 75.0000
Conversation turn structure and role transitions: 87.0000
Beginning of a new conversation or input: 83.0000
Medical case presentations with complex patient symptoms: 0.0000
The model should not recommend technological or medical interventions: 0.0000
Medical imaging techniques and procedures: 0.0000
Website disclaimers about automated translation services: 0.0000
End of phrase or question punctuation: 0.0000
Practical application of logical conditionals and qualifiers in structured interactions: 0.0000
Technical and scientific details in explanations or descriptions: 0.0000
References to structured eligibility criteria and specific conditions: 0.0000
The user is asking about human qualities or experiences: 0.0000
Structured Assessments and Evaluations: 0.6712
Connective words in technical explanations: 0.0000
Precise measurements and specific details: 0.0000
The model is providing multiple choice answer options

In [82]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def evaluate_model_predictions(classifier, truthful_examples, hallucinated_examples):
    """
    Evaluate model predictions across all examples.
    
    Args:
        classifier: The HallucinationClassifier instance
        truthful_examples: DataFrame containing truthful examples
        hallucinated_examples: DataFrame containing hallucinated examples
    """
    # Store results
    results = []
    
    # Process truthful examples
    print("\nProcessing truthful examples...")
    for idx, row in tqdm(truthful_examples.iterrows(), total=len(truthful_examples)):
        prediction, confidence = classifier.predict(row['prompt'])
        results.append({
            'true_label': 'truthful',
            'predicted': 'hallucinated' if prediction == 1 else 'truthful',
            'confidence': confidence
        })
    
    # Process hallucinated examples
    print("\nProcessing hallucinated examples...")
    for idx, row in tqdm(hallucinated_examples.iterrows(), total=len(hallucinated_examples)):
        prediction, confidence = classifier.predict(row['prompt'])
        results.append({
            'true_label': 'hallucinated',
            'predicted': 'hallucinated' if prediction == 1 else 'truthful',
            'confidence': confidence
        })
    
    # Convert to DataFrame for analysis
    results_df = pd.DataFrame(results)
    
    # Calculate overall statistics
    total_predictions = len(results_df)
    print("\nOverall Statistics:")
    print(f"Total examples evaluated: {total_predictions}")
    
    # Prediction distribution
    pred_dist = results_df['predicted'].value_counts()
    print("\nPrediction Distribution:")
    for pred, count in pred_dist.items():
        percentage = (count/total_predictions) * 100
        print(f"{pred}: {count} ({percentage:.1f}%)")
    
    # Confusion Matrix
    print("\nConfusion Matrix:")
    confusion = pd.crosstab(results_df['true_label'], results_df['predicted'])
    print(confusion)
    
    # Calculate metrics by class
    print("\nMetrics by True Label:")
    for label in ['truthful', 'hallucinated']:
        class_results = results_df[results_df['true_label'] == label]
        correct = (class_results['true_label'] == class_results['predicted']).sum()
        total = len(class_results)
        accuracy = (correct/total) * 100
        avg_confidence = class_results['confidence'].mean()
        
        print(f"\n{label.title()} Examples:")
        print(f"Accuracy: {accuracy:.1f}%")
        print(f"Average Confidence: {avg_confidence:.3f}")
    
    # Return the results DataFrame for further analysis if needed
    return results_df


In [85]:
## TRAINING DATASET EVALUATION

# Assuming classifier is already initialized
results_df = evaluate_model_predictions(classifier, truthful_examples_test[0:30], hallucinated_examples_test[0:30])

# You can do additional analysis on results_df if needed
# For example, look at high confidence mistakes:
high_conf_mistakes = results_df[
    (results_df['true_label'] != results_df['predicted']) & 
    (results_df['confidence'] > 0.8)
]


Processing truthful examples...


100%|██████████| 30/30 [01:29<00:00,  2.97s/it]



Processing hallucinated examples...


100%|██████████| 30/30 [01:28<00:00,  2.96s/it]


Overall Statistics:
Total examples evaluated: 60

Prediction Distribution:
hallucinated: 34 (56.7%)
truthful: 26 (43.3%)

Confusion Matrix:
predicted     hallucinated  truthful
true_label                          
hallucinated            20        10
truthful                14        16

Metrics by True Label:

Truthful Examples:
Accuracy: 53.3%
Average Confidence: 0.585

Hallucinated Examples:
Accuracy: 66.7%
Average Confidence: 0.572



