# Get Huggingface Data

In [1]:
# install Huggingface datasets
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
##################################################
# EITAN
# This is the first piece of code you need to connect to the pubmed_qa data
# for multiple choice medical questions, where NOTA can be added.
#
##################################################

# Acquire the PubMedQA Data
# Source:
# https://huggingface.co/datasets/bigbio/pubmed_qa

from datasets import load_dataset

pqal0 = load_dataset("bigbio/pubmed_qa", "pubmed_qa_labeled_fold0_source")
train_data = pqal0['train']

# see example
train_data[0]

README.md:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

pubmed_qa.py:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

bigbiohub.py:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

The repository for bigbio/pubmed_qa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/pubmed_qa.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


pqal.zip:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'QUESTION': 'Does histologic chorioamnionitis correspond to clinical chorioamnionitis?',
 'CONTEXTS': ['To evaluate the degree to which histologic chorioamnionitis, a frequent finding in placentas submitted for histopathologic evaluation, correlates with clinical indicators of infection in the mother.',
  'A retrospective review was performed on 52 cases with a histologic diagnosis of acute chorioamnionitis from 2,051 deliveries at University Hospital, Newark, from January 2003 to July 2003. Third-trimester placentas without histologic chorioamnionitis (n = 52) served as controls. Cases and controls were selected sequentially. Maternal medical records were reviewed for indicators of maternal infection.',
  'Histologic chorioamnionitis was significantly associated with the usage of antibiotics (p = 0.0095) and a higher mean white blood cell count (p = 0.018). The presence of 1 or more clinical indicators was significantly associated with the presence of histologic chorioamnionitis (p =

## Preprocess data

Aappending contexts to question

In [78]:
##################################################
#
# Eitan: ... and this
#
##################################################


def prepare_pubmed_data(train_data, sample_size=100):
    """
    Prepare PubMedQA data for Goodfire analysis
    """
    # Separate data by answer type
    yes_examples = []
    no_examples = []
    maybe_examples = []

    for example in train_data:

        combined_context = ' '.join(example['CONTEXTS'])
        combined_question = example['QUESTION']+". Review the following context and answer with a single word from this list: ['Yes', 'No', 'Maybe']. "

        msg = [
            {
                "role": "user",
                "content": f"Question: {combined_question}\nContext: {combined_context}"
            },
            {
                "role": "assistant",
                "content": example['final_decision']
            }
        ]

        if example['final_decision'].lower() == 'yes':
            yes_examples.append(msg)

        elif example['final_decision'].lower() == 'no':
            no_examples.append(msg)

        else:  # maybe
            maybe_examples.append(msg)

    # Trim to sample size
    yes_examples   = yes_examples[:sample_size]
    no_examples    = no_examples[:sample_size]
    maybe_examples = maybe_examples[:sample_size]

    return yes_examples, no_examples, maybe_examples

In [9]:
# Prepare data
sample_size = 100
print("Preparing data...")
yes_examples, no_examples, maybe_examples = prepare_pubmed_data(train_data, sample_size)

# view example
yes_examples[0]

Preparing data...


[{'role': 'user',
  'content': "Question: Does histologic chorioamnionitis correspond to clinical chorioamnionitis?. Review the following context and answer with a single word from this list: ['Yes', 'No', 'Maybe']. \nContext: To evaluate the degree to which histologic chorioamnionitis, a frequent finding in placentas submitted for histopathologic evaluation, correlates with clinical indicators of infection in the mother. A retrospective review was performed on 52 cases with a histologic diagnosis of acute chorioamnionitis from 2,051 deliveries at University Hospital, Newark, from January 2003 to July 2003. Third-trimester placentas without histologic chorioamnionitis (n = 52) served as controls. Cases and controls were selected sequentially. Maternal medical records were reviewed for indicators of maternal infection. Histologic chorioamnionitis was significantly associated with the usage of antibiotics (p = 0.0095) and a higher mean white blood cell count (p = 0.018). The presence of 

# Get Goodfire


In [10]:
##################################################
#
# Eithan, you don't need the rest of this...
# Its the classifier built my way, but ignore it, we'll use the standard code
#
##################################################


!pip install goodfire



## Extract features using Goodfire API

submit pubmedqa prompts, get top_k features.

Using top_k = 50, as per example at: https://docs.goodfire.ai/examples/decision_trees.html

In [76]:
import goodfire

def contrasting_features_yesno(client, variant, yes_examples, no_examples):
    """
    Use Goodfire to extract contrasting features between different answer types
    """
    print("Computing features for yes vs no...")
    yes_no_features, _ = client.features.contrast(
        dataset_1=yes_examples,
        dataset_2=no_examples,
        dataset_1_feature_rerank_query="medical findings supporting yes",
        dataset_2_feature_rerank_query="medical findings supporting no",
        model=variant,
        top_k=50
    )
    return yes_no_features

def contrasting_features_yesmaybe(client, variant, yes_examples, maybe_examples):
    """
    Use Goodfire to extract contrasting features between different answer types
    """
    # NOTE:

    print("Computing features for yes vs maybe...")
    yes_maybe_features, _ = client.features.contrast(
        dataset_1=yes_examples,
        dataset_2=maybe_examples,
        dataset_1_feature_rerank_query="clear medical evidence",
        dataset_2_feature_rerank_query="uncertain medical evidence",
        model=variant,
        top_k=50
    )
    return yes_maybe_features

In [17]:
from google.colab import userdata

# Get API key
api_key = userdata.get('GOODFIRE_API_KEY')

# Initialize Goodfire
client  = goodfire.Client(api_key)
variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

# Extract features
print("Extracting features...")
features = contrasting_features_yesno(client, variant, yes_examples, no_examples)

Extracting features...
Computing features for yes vs no...


In [77]:
features

FeatureGroup([
   0: "Toxicological studies and safety assessments in animal models",
   1: "Medical imaging techniques and procedures",
   2: "Diabetes as a medical condition",
   3: "Experimental design elements in medical and scientific studies",
   4: "Affirmative response or agreement in constrained dialogue",
   5: "References to structured eligibility criteria and specific conditions",
   6: "ADHD-related behaviors and symptoms",
   7: "Numerical comparisons and thresholds in financial/legal contexts",
   8: "Legal, medical, and technical terms ending in -ated or -ected",
   ...
   49: "Start of a new message or input in a conversation"
])

In [18]:
import concurrent.futures as futures
import tqdm

def get_feature_activations(client, variant, examples, features, k=50):
    """
    Get feature activations for a set of examples using Goodfire
    """
    samples = []

    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures_list = []

        for example in examples:
            futures_list.append(
                executor.submit(
                    client.features.inspect,
                    example,
                    model=variant,
                    features=features,
                )
            )

        for future in tqdm.tqdm(futures_list):
            context = future.result()
            features = context.top(k=k)
            samples.append(features)

    return samples

In [19]:
# Get feature activations for each class
print("Computing feature activations...")

yes_activations   = get_feature_activations(client, variant, yes_examples, features)
no_activations    = get_feature_activations(client, variant, no_examples, features)
# maybe_activations = get_feature_activations(client, variant, maybe_examples, features)

Computing feature activations...


100%|██████████| 100/100 [00:44<00:00,  2.27it/s]
100%|██████████| 100/100 [00:47<00:00,  2.09it/s]


In [21]:
def prepare_feature_matrix(feature_activations, features):
    """
    Convert feature activations into a matrix for training
    """
    def _select_feature_acts(features, row):
        output = []
        for feature in features:
            found = False
            for feature_act in row:
                if feature_act.feature.uuid == feature.uuid:
                    output.append(feature_act.activation)
                    found = True
                    break
            if not found:
                output.append(0.0)  # Default value if feature not found
        return output

    X = [_select_feature_acts(features, row) for row in feature_activations]
    return X

In [24]:
# Prepare feature matrix
X_yes = prepare_feature_matrix(yes_activations, features)
X_no = prepare_feature_matrix(no_activations, features)
# X_maybe = prepare_feature_matrix(maybe_activations, features)

# view example, we expect 50 features
print("Length of X_yes = ", len(X_yes))
print("Length of X_no  = ", len(X_no))
print("Example of X_yes:\n ", X_yes[0])

Length of X_yes =  100
Length of X_no  =  100
Example of X_yes:
  [0, 0.811279296875, 0, 3.4091796875, 2.61328125, 0, 0, 0.427734375, 0, 0.2734375, 0.47021484375, 0.73828125, 0, 0, 0, 0.5657552083333334, 0, 0, 0, 0, 0, 0.3217075892857143, 0.95849609375, 0, 0, 0, 0, 0, 0.9083806818181818, 0, 0, 0, 0.394921875, 0, 0, 0.33984375, 0, 0, 0, 0, 0.298828125, 0, 0.263671875, 0, 0.7578125, 0, 0, 0, 0, 0]


# Compressibility

We have 100 samples of yes and 100 no, but 50 predictors. Could overfit.

What is compressibility of this data?



In [28]:
import numpy as np
from typing import List, Dict, Tuple, NamedTuple
from dataclasses import dataclass

class DatasetStats(NamedTuple):
  position_variances: np.ndarray  # Variance at each position across all examples
  top_variant_positions: List[int]  # Indices of positions with highest variance
  position_activity: np.ndarray  # Percentage of non-zero values at each position
  mean_vector: np.ndarray  # Mean value at each position
  std_vector: np.ndarray  # Standard deviation at each position
  sparsity: float  # Overall sparsity of the dataset

def analyze_datasets(examples, n_top_positions = 5):
  """
  Analyze multiple examples simultaneously to find the most variant positions.

  Args:
      examples: List of examples, where each example is a list of float values
      n_top_positions: Number of top variant positions to identify

  Returns:
      DatasetStats containing analysis results
  """
  # Convert to numpy array for efficient computation
  data = np.array(examples)

  # Calculate variance at each position
  position_variances = np.var(data, axis=0)

  # Get indices of positions with highest variance
  top_variant_positions = np.argsort(position_variances)[-n_top_positions:].tolist()[::-1]

  # Calculate percentage of non-zero values at each position
  position_activity = np.mean(data != 0, axis=0) * 100

  # Calculate mean and std at each position
  mean_vector = np.mean(data, axis=0)
  std_vector = np.std(data, axis=0)

  # Calculate overall sparsity
  sparsity = np.mean(data == 0) * 100

  return DatasetStats(
      position_variances=position_variances,
      top_variant_positions=top_variant_positions,
      position_activity=position_activity,
      mean_vector=mean_vector,
      std_vector=std_vector,
      sparsity=sparsity
  )

def print_analysis_report(stats: DatasetStats, n_positions: int = 5):
  """
  Print a comprehensive analysis report.

  Args:
      stats: DatasetStats object containing analysis results
      n_positions: Number of top positions to show in detail
  """
  print(f"Dataset Analysis Report")
  print("=" * 50)
  print(f"\nOverall Statistics:")
  print(f"Sparsity: {stats.sparsity:.2f}% zeros")

  print(f"\nTop {n_positions} Most Variant Positions:")
  print("-" * 50)
  print(f"{'Position':^10} {'Variance':^12} {'Activity%':^12} {'Mean':^12} {'Std':^12}")
  print("-" * 50)

  for pos in stats.top_variant_positions[:n_positions]:
      print(f"{pos:^10} {stats.position_variances[pos]:^12.4f} "
            f"{stats.position_activity[pos]:^12.2f} "
            f"{stats.mean_vector[pos]:^12.4f} "
            f"{stats.std_vector[pos]:^12.4f}")

In [66]:
# Combine Data, predictors (x) and targets (y)
import random

print(f"There are {len(X_yes)} examples for 'Yes'")
print(f"There are {len(X_no)} examples for 'No'")

X = X_yes + X_no #+ X_maybe
y = ([1] * len(X_yes)) + ([0] * len(X_no))  # yes=1, no=0
#y = ([2] * len(X_yes)) + ([0] * len(X_no)) + ([1] * len(X_maybe))  # yes=2, no=0, maybe=1
assert len(X) == len(y)

print("Therefore...")
print("Total length of X:", len(X))
print("Total length of y:", len(y))
print("\n")

# Let's view a random sample
indices = random.sample(range(len(X)), 3)
print("Some random examples")
for i in indices:
  print(f"Element {i}:")
  print("   X:", X[i])
  print("   y:", y[i])


There are 100 examples for 'Yes'
There are 100 examples for 'No'
Therefore...
Total length of X: 200
Total length of y: 200


Some random examples
Element 87:
   X: [0, 0.9619140625, 0, 1.6256917317708333, 2.31640625, 0.34765625, 0.2734375, 0.40234375, 1.984375, 0.3324652777777778, 0.548828125, 0, 0.51953125, 0, 0.29296875, 0.37327398255813954, 0.337890625, 0, 0, 0.4267578125, 0, 0.267578125, 0.5700334821428571, 0.3078125, 0, 0, 0, 0, 1.1246481948757765, 0.5631510416666666, 0, 0, 0.3439670138888889, 0.26318359375, 0.3623046875, 0.3854166666666667, 0.4388020833333333, 0, 0, 0, 0.32421875, 0, 0, 0, 1.0546875, 0, 0, 0, 0, 0]
   y: 1
Element 101:
   X: [0, 0.4296875, 0, 0.33984375, 2.8125, 0, 0.345703125, 0.361328125, 0, 0, 0, 0, 0, 0, 0, 0, 0.455078125, 0, 0, 0, 0.3857421875, 0.375, 0.583984375, 0, 0.2734375, 0, 0, 0.265625, 0.7085238821138211, 0, 0, 0, 0.3784877232142857, 0, 0.344921875, 0.306640625, 0, 0, 0, 0, 0.3564453125, 0, 0, 0, 1.0625, 0, 0, 0, 0.33203125, 0]
   y: 0
Element 84:
 

In [53]:

# Analyse compressibility
n_top_positions = 10
stats = analyze_datasets(X, n_top_positions)
print_analysis_report(stats, n_top_positions)

Dataset Analysis Report

Overall Statistics:
Sparsity: 62.53% zeros

Top 10 Most Variant Positions:
--------------------------------------------------
 Position    Variance    Activity%       Mean         Std     
--------------------------------------------------
    2         0.4545       13.50        0.1976       0.6741   
    3         0.3681       85.50        0.8272       0.6067   
    8         0.3612       58.00        0.5206       0.6010   
    12        0.3437       51.50        0.4810       0.5862   
    1         0.2874       73.50        0.6172       0.5361   
    11        0.1700       67.50        0.4831       0.4123   
    38        0.1523       27.00        0.1957       0.3903   
    23        0.0977       32.50        0.1871       0.3126   
    4         0.0894       100.00       2.7182       0.2990   
    10        0.0884       80.00        0.4673       0.2974   


In [72]:
# get important locations in data
X_compressed = [[x[i] for i in stats.top_variant_positions[0:1]] for x in X]

# view example
print("X:\n", X_compressed[0])
print("Y:\n", y[0])


X:
 [0]
Y:
 1


## Train the Decision Tree

For speed we'll sub divide the training set into train and test, then I can use objects already created...

Lazy, I know, but in a hurry here...and justtrying to explore the territory...

In [73]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score
import pandas as pd

def train_tree(X, y, depth):
    """
    Train a decision tree classifier
    """
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state=42)

    model = tree.DecisionTreeClassifier(
        max_depth=depth,
        min_samples_leaf=len(train_x) // 20,
        random_state=42
    )

    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    accuracy = balanced_accuracy_score(test_y, pred)
    score = f1_score(test_y, pred, average='weighted')

    return model, pred, score, accuracy, (train_x, test_x, train_y, test_y)

In [74]:
# Train model on original data
print("Training decision tree...")
model, pred, score, accuracy, splits = train_tree(X, y, depth=3)

print(f"Balanced Accuracy: {accuracy:.3f}")
print(f"F1 Score: {score:.3f}")

Training decision tree...
Balanced Accuracy: 1.000
F1 Score: 1.000


In [75]:
# Train model on compressed data
print("Training decision tree...")
model, pred, score, accuracy, splits = train_tree(X_compressed, y, depth=3)

print(f"Balanced Accuracy: {accuracy:.3f}")
print(f"F1 Score: {score:.3f}")

Training decision tree...
Balanced Accuracy: 0.445
F1 Score: 0.318


## Visualise

As per goodfire example

In [None]:
# Create visualisation
import graphviz

feature_names = [feature.label for feature in features]
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    class_names=['no', 'maybe', 'yes'],
    filled=True,
    rounded=True,
    special_characters=True
)
viz = graphviz.Source(dot_data)