## Changing the attention mechanism

In [None]:
pip install transformers



In [None]:
!git clone https://github.com/huggingface/transformers.git
%cd transformers

Cloning into 'transformers'...
remote: Enumerating objects: 242277, done.[K
remote: Counting objects: 100% (22120/22120), done.[K
remote: Compressing objects: 100% (1397/1397), done.[K
remote: Total 242277 (delta 21663), reused 20797 (delta 20687), pack-reused 220157 (from 1)[K
Receiving objects: 100% (242277/242277), 246.81 MiB | 26.68 MiB/s, done.
Resolving deltas: 100% (178173/178173), done.
/content/transformers


In [None]:
!src/transformers/models/llama/

/bin/bash: line 1: src/transformers/models/llama/: Is a directory


In [None]:
import sys
# Expose the cloned repository root on the import path.
# NOTE(review): the clone keeps its package under `src/` (see the `src/transformers/models/llama/`
# path inspected above), so this entry presumably does not make the cloned sources importable and
# the pip-installed package is what actually gets imported -- confirm which one is intended.
sys.path.append('/content/transformers')

In [None]:
from transformers.models.llama.modeling_llama import LlamaModel, LlamaConfig,LlamaRotaryEmbedding,apply_rotary_pos_emb,LlamaRMSNorm,LlamaMLP,Cache
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

In [None]:
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.modeling_utils import PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS



In [None]:
class Llama_C_Attention(nn.Module):
    """Multi-headed self-attention whose logits are blended with a fixed distance-decay bias.

    The scaled dot-product logits are mixed as
    ``(1 - alpha) * logits + alpha * exp(-|i - j| * decay_rate)`` where ``i`` is the query
    index and ``j`` the key index, before the additive attention mask and the softmax.
    The notebook calls this term a "Gaussian bias"; note that the formula decays with the
    absolute (not squared) distance. ``alpha`` and ``decay_rate`` are fixed, non-trainable
    Python floats. With ``decay_rate = 82.86`` the term is 1 on the diagonal and about
    ``exp(-82.86)`` for adjacent positions.

    Keys and values are projected with ``num_attention_heads`` heads (the config's
    ``num_key_value_heads`` is not consulted).
    """

    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
        """Build the projections and rotary embedding from ``config``.

        Args:
            config: Llama configuration; reads ``attention_dropout``, ``hidden_size``,
                ``num_attention_heads`` and ``attention_bias``.
            layer_idx: Index of the decoder layer that owns this module. Required whenever a
                key/value cache is passed to ``forward``.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
        self.rotary_emb = LlamaRotaryEmbedding(config=self.config)

        # Fixed parameters for the distance-decay ("Gaussian") bias
        self.decay_rate = 82.86
        self.alpha = 0.37

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run biased self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape ``(batch, q_len, hidden_size)``.
            attention_mask: Optional additive mask added to the logits; its last dimension is
                cropped to the key length before use.
            position_ids: Positions used to compute rotary embeddings when
                ``position_embeddings`` is not supplied.
            past_key_value: Optional key/value cache. When given, the new keys/values are
                written to it and attention runs over the cached sequence.
            output_attentions: Return the post-softmax attention weights when true.
            use_cache: Accepted for interface compatibility; not read here.
            cache_position: Absolute indices of the query tokens in the full sequence. Used
                for the cache update and for the distance bias.
            position_embeddings: Optional precomputed ``(cos, sin)`` rotary tensors.

        Returns:
            ``(attn_output, attn_weights or None, past_key_value)``.

        Raises:
            ValueError: If a cache is supplied but the module was built without ``layer_idx``.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        # Store the new keys/values and attend over everything cached so far. Without this
        # step, incremental decoding would only ever see the tokens of the current call.
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    "Llama_C_Attention needs `layer_idx` to read from / write to a key-value cache."
                )
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        kv_len = key_states.size(-2)

        # Compute attention weights
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        # Distance-decay bias of shape (q_len, kv_len). Query indices are absolute positions in
        # the full sequence so the bias stays aligned when the keys include cached tokens; with
        # no cache this reduces to indices 0..q_len-1 on both axes.
        if cache_position is not None and cache_position.numel() == q_len:
            query_positions = cache_position.to(query_states.device)
        else:
            query_positions = torch.arange(kv_len - q_len, kv_len, device=query_states.device)
        key_positions = torch.arange(kv_len, device=query_states.device)
        distance = torch.abs(key_positions[None, :] - query_positions[:, None])
        gaussian_bias = torch.exp(-distance * self.decay_rate)
        attn_weights = (1 - self.alpha) * attn_weights + self.alpha * gaussian_bias

        # Add attention mask (cropped to the key length in case it was built for a longer cache)
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask[..., :kv_len]

        # Softmax in float32 for stability, then return to the activation dtype so the product
        # with `value_states` does not mix dtypes (the float32 bias promotes the logits).
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        # (batch, heads, q_len, head_dim) -> (batch, q_len, heads * head_dim)
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


In [None]:
class Llama_C_DecoderLayer(nn.Module):
    """Pre-norm decoder block: ``Llama_C_Attention`` followed by ``LlamaMLP``.

    Each sub-layer is preceded by an RMSNorm and wrapped in a residual connection.
    """

    def __init__(self, config: LlamaConfig, layer_idx: int):
        """Create the attention, MLP and the two normalisation layers from ``config``."""
        super().__init__()
        self.hidden_size = config.hidden_size

        # Custom attention module defined in this notebook.
        self.self_attn = Llama_C_Attention(config=config, layer_idx=layer_idx)

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """Apply the block to ``hidden_states``.

        All arguments other than ``hidden_states`` are forwarded unchanged to the attention
        module.

        Returns:
            A tuple starting with the block output, followed by the attention weights when
            ``output_attentions`` is truthy and then by the cache object returned by the
            attention module when ``use_cache`` is truthy.
        """
        # Attention sub-layer: normalise, attend, add the skip connection.
        attn_out, attn_probs, cache_out = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        after_attention = hidden_states + attn_out

        # Feed-forward sub-layer: normalise, project, add the skip connection.
        mlp_out = self.mlp(self.post_attention_layernorm(after_attention))
        block_out = after_attention + mlp_out

        # Optional extras are appended in a fixed order: attention weights, then cache.
        extras = []
        if output_attentions:
            extras.append(attn_probs)
        if use_cache:
            extras.append(cache_out)

        return (block_out, *extras)


In [None]:
class CustomLlamaModel(LlamaModel):
    """``LlamaModel`` whose decoder stack is rebuilt from ``Llama_C_DecoderLayer``.

    Everything other than ``self.layers`` is inherited from the parent class unchanged.
    """

    def __init__(self, config):
        """Build the stock model, then swap in the custom decoder layers.

        Args:
            config: Model configuration; ``num_hidden_layers`` sets the number of custom layers.
        """
        super().__init__(config)
        # Reconstruct the layers using the custom decoder layer (the stock layers created by
        # the parent constructor are discarded).
        self.layers = nn.ModuleList(
            [Llama_C_DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # The parent constructor finished its own weight initialisation before the swap, so the
        # freshly created layers would otherwise keep PyTorch's default initialisation.
        # Re-running post_init() applies the model's initialisation scheme to them.
        self.post_init()

In [None]:
from transformers import PreTrainedModel
from torch import nn
from typing import Optional, Tuple, Union
from transformers.modeling_outputs import CausalLMOutputWithPast


class LlamaForCausalLM(PreTrainedModel):
    """Causal language model: ``CustomLlamaModel`` backbone plus a linear LM head.

    NOTE(review): this class shadows the name of the stock ``LlamaForCausalLM`` in
    transformers. A checkpoint saved from it presumably records that same architecture name,
    so loading it through the library's Auto classes may build the stock model (without the
    custom attention) -- confirm before relying on downstream evaluation numbers.
    """

    # Needed by `from_pretrained` to rebuild the configuration and to match checkpoint keys.
    config_class = LlamaConfig
    base_model_prefix = "model"

    def __init__(self, config):
        """Create the backbone and the LM head, then initialise the weights.

        Args:
            config: Llama configuration; reads ``hidden_size``, ``vocab_size`` and
                ``initializer_range``.
        """
        super().__init__(config)

        # using custom LlamaModel with modified decoder layers and attention
        self.model = CustomLlamaModel(config)

        # Language modeling head
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights
        self.post_init()

    def _init_weights(self, module):
        """Initialise ``module`` with a normal distribution of std ``config.initializer_range``.

        Defined here so that ``post_init()`` initialises the LM head and the custom decoder
        layers with the Llama scheme instead of relying on the base-class hook.
        """
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        Args:
            input_ids: Token ids fed to the backbone.
            attention_mask: Optional padding mask forwarded to the backbone.
            position_ids: Optional position indices forwarded to the backbone.
            labels: Optional target ids. They are shifted by one position inside this method,
                so they can be a copy of ``input_ids``; entries equal to the default
                ``ignore_index`` of ``nn.CrossEntropyLoss`` do not contribute to the loss.
            past_key_values: Optional key/value cache forwarded to the backbone.
            use_cache: Forwarded to the backbone.

        Returns:
            A ``CausalLMOutputWithPast`` when ``config.use_return_dict`` is true, otherwise a
            tuple ``(loss?, logits, *backbone_extras)``.
        """
        # Get outputs from the custom LlamaModel
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift logits and labels for causal language modeling loss:
            # position t predicts token t + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not self.config.use_return_dict:
            return ((loss,) + (logits,) + outputs[1:]) if loss is not None else (logits,) + outputs[1:]

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            # Pass through whatever the backbone produced (None unless requested via config).
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


## Setup





In [None]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install --upgrade pyarrow datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
!pip uninstall pyarrow
!pip install pyarrow

Found existing installation: pyarrow 18.1.0
Uninstalling pyarrow-18.1.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/pyarrow-18.1.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/pyarrow/*
Proceed (Y/n)? y
  Successfully uninstalled pyarrow-18.1.0
Collecting pyarrow
  Using cached pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.1 MB)
Installing collected packages: pyarrow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.10.1 requires pyarrow<18.0.0a0,>=14.0.0, but you have pyarrow 18.1.0 which is incompatible.
pylibcudf-cu12 24.10.1 requires pyarrow<18.0.0a0,>=14.0.0, but you have pyarrow 18.1.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-18.1.0


In [None]:
from datasets import Dataset
import re
from transformers import GPT2Tokenizer
from transformers import GPT2Config, GPT2LMHeadModel
import torch
from transformers import DataCollatorForLanguageModeling
from huggingface_hub import notebook_login
from transformers import Trainer, TrainingArguments
import subprocess


In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoConfig

# Hub checkpoint that supplies both the tokenizer and the architecture hyper-parameters.
BASE_CHECKPOINT = "babylm/babyllama-100m-2024"

# Only the configuration is fetched here; no pretrained weights are loaded into the model.
config = AutoConfig.from_pretrained(BASE_CHECKPOINT)

# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

# Build the custom causal LM from the configuration alone.
model = LlamaForCausalLM(config)


In [None]:
model

LlamaForCausalLM(
  (model): CustomLlamaModel(
    (embed_tokens): Embedding(16000, 512, padding_idx=0)
    (layers): ModuleList(
      (0-15): 16 x Llama_C_DecoderLayer(
        (self_attn): Llama_C_Attention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1024, bias=False)
          (up_proj): Linear(in_features=512, out_features=1024, bias=False)
          (down_proj): Linear(in_features=1024, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((512,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((512,), eps=1e-06)
      )
    )
 

In [None]:
config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "babylm/babyllama-100m-2024",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "max_position_embeddings": 256,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 16000
}

In [None]:
def preprocess_text(data):
    """Clean raw subtitle text for language-model training.

    Steps, in order: drop parenthesised spans, drop all-uppercase words of two or more
    letters, lowercase everything, and collapse whitespace runs into single spaces.

    Args:
        data: Raw text as a single string.

    Returns:
        The cleaned, lowercased string with no leading or trailing whitespace.
    """
    # Step 1: Remove text within parentheses
    data = re.sub(r'\([^)]*\)', '', data)

    # Step 2: Remove all-uppercase words (likely sound effects). At least two letters are
    # required so that ordinary one-letter words such as "I" and "A" are kept.
    data = re.sub(r'\b[A-Z]{2,}\b', '', data)

    # Step 3: Convert to lowercase
    data = data.lower()

    # Step 4: Normalize spaces
    data = re.sub(r'\s+', ' ', data).strip()

    return data

# Load the data
file_path = '/content/drive/MyDrive/data/text_data.zip (Unzipped Files)/train_100M/open_subtitles.train'

# Decode explicitly instead of relying on the runtime's default locale.
# Assumes the corpus file is UTF-8 encoded -- TODO confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = file.read()

# Apply preprocessing
cleaned_data = preprocess_text(raw_data)

# Display a sample of the cleaned data
print(cleaned_data[:500])


In [None]:
# Count every parameter tensor's elements and report the total in millions.
model_size = sum(param.numel() for param in model.parameters())
print(f"babyllama size: {model_size/1000**2:.1f}M parameters")

# Context length the configuration declares for the model.
print(f"Maximum input size for the model: {config.max_position_embeddings}")


58.343936

In [None]:
max_length = 256

# Slice the cleaned corpus into consecutive, non-overlapping windows.
# NOTE(review): the windows are `max_length` *characters* wide (plain string slicing), not
# tokens, and may cut words in half -- confirm this is the intended chunking.
window_starts = range(0, len(cleaned_data), max_length)
input_texts = [cleaned_data[start:start + max_length] for start in window_starts]

# Encode every window in a single batched call, padded/truncated and returned as PyTorch tensors.
tokenized_data = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# Peek at the first five encoded windows.
print(tokenized_data["input_ids"][:5])

In [None]:
from datasets import Dataset


dataset = Dataset.from_dict(tokenized_data)

# Split the dataset into train and test sets (90% train, 10% test).
# A fixed seed makes the shuffle, and therefore the split, identical on every run.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Training

In [None]:
# Training configuration: evaluate once per epoch, cosine learning-rate schedule, mixed
# precision, and upload to the Hub under `hub_model_id`.
training_args = TrainingArguments(
    output_dir='Harshatheeswar',
    hub_model_id='babylama-attentionchange_correct',
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    eval_strategy='epoch',
    auto_find_batch_size=True,
    num_train_epochs=5,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    learning_rate=5e-5,
    fp16=True,
    push_to_hub=True,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    # `processing_class` replaces the deprecated `tokenizer` argument; the recorded run emitted
    # a warning on this call, presumably that deprecation notice.
    processing_class=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Push the trained model to the Hugging Face Hub
trainer.push_to_hub()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
0,4.3004,4.336871
1,4.0455,4.089211
2,3.9127,3.981016
3,3.8062,3.939464
4,3.7813,3.935231


events.out.tfevents.1732655951.b9dc60cf926e.1732.0:   0%|          | 0.00/598k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Harshatheeswar/babylama-attentionchange_correct/commit/8f79efe5cfaa558c1414d1a9e5a79f2b8d47ed2b', commit_message='End of training', commit_description='', oid='8f79efe5cfaa558c1414d1a9e5a79f2b8d47ed2b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Harshatheeswar/babylama-attentionchange_correct', endpoint='https://huggingface.co', repo_type='model', repo_id='Harshatheeswar/babylama-attentionchange_correct'), pr_revision=None, pr_num=None)

## Evaluation

In [None]:
%cd /content/drive/MyDrive/new_evaluation_pipeline


In [None]:
!pip install -e
!pip install minicons
!pip install --upgrade accelerate

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install evaluate
!pip install sacrebleu
!apt-get install -y libnvinfer8 libnvinfer-plugin8
!apt-get install -y libnvparsers8 libnvonnxparsers8
!pip install sqlitedict
!pip install peft
!pip install pytablewriter

In [None]:
!bash eval_blimp.sh Harshatheeswar/babylama-attentionchange_correct

In [None]:
!bash eval_ewok.sh Harshatheeswar/babylama-attentionchange_correct

## Printing Samples

In [None]:
import os
import json

def load_jsonl_or_json_array(file_path, num_samples=2):
    """Load the first few records from a JSON-array file or a JSONL file.

    The format is detected from the first character: ``[`` means a single JSON array,
    anything else is read as one JSON document per line.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        num_samples: Maximum number of records to return.

    Returns:
        A list with at most ``num_samples`` parsed records. For JSONL input, blank and
        malformed lines are skipped (malformed ones are reported on stdout) and do not count
        toward the limit. An unparsable JSON array is reported on stdout and yields ``[]``.
    """
    samples = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Peek at the first character to pick the format, then rewind.
        first_char = f.read(1)
        f.seek(0)

        if first_char == '[':

            try:
                data = json.load(f)
                samples = data[:num_samples]
            except json.JSONDecodeError as e:
                print(f"Error reading JSON array file {file_path}: {e}")
        else:

            for line in f:
                # Stop on the number of records collected, not the number of lines read, so
                # skipped lines do not shrink the result.
                if len(samples) >= num_samples:
                    break
                line = line.strip()
                if not line:
                    continue
                try:
                    samples.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Skipping malformed line in {file_path}: {line}")
    return samples

def print_blimp_samples(samples):
    """Print a readable summary of each blimp result record.

    For every record this prints the good/bad sentence pair from ``doc``, the ``acc`` value,
    and (when present and non-empty) the score/correct pairs found in ``resps`` and
    ``filtered_resps``, followed by a separator line. Missing fields print as ``N/A``.
    """
    for record in samples:
        doc = record.get("doc", {})
        good_sentence = doc.get("sentence_good", "N/A")
        bad_sentence = doc.get("sentence_bad", "N/A")
        accuracy = record.get("acc", "N/A")
        raw_predictions = record.get("resps", [])
        kept_predictions = record.get("filtered_resps", [])

        print("Sentence (Good):", good_sentence)
        print("Sentence (Bad):", bad_sentence)
        print("Accuracy:", accuracy)

        # Each raw response wraps its (score, correct) pair in one extra list level.
        if raw_predictions:
            print("Predictions (resps):")
            for i, wrapped in enumerate(raw_predictions):
                score, correct = wrapped[0][0], wrapped[0][1]
                print(f"  Response {i + 1}: Score = {score}, Correct = {correct}")

        # Filtered responses are already flat (score, correct) pairs.
        if kept_predictions:
            print("Filtered Predictions (filtered_resps):")
            for i, pair in enumerate(kept_predictions):
                score, correct = pair
                print(f"  Filtered Response {i + 1}: Score = {score}, Correct = {correct}")

        print("=" * 40)

def print_ewok_samples(samples):
    """Print a readable summary of each ewok result record.

    For every record this prints the domain, concepts, contexts and targets from ``doc``,
    the ``acc`` value, and (when present and non-empty) the scores found in ``resps`` and
    ``filtered_resps``, followed by a separator line. Missing fields print as ``N/A``.
    """
    # (printed label, key inside the record's "doc" mapping), in display order.
    doc_fields = (
        ("Domain:", "Domain"),
        ("Concept A:", "ConceptA"),
        ("Concept B:", "ConceptB"),
        ("Context 1:", "Context1"),
        ("Context 2:", "Context2"),
        ("Target 1:", "Target1"),
        ("Target 2:", "Target2"),
    )

    for record in samples:
        doc = record.get("doc", {})
        field_values = [(label, doc.get(key, "N/A")) for label, key in doc_fields]
        accuracy = record.get("acc", "N/A")
        raw_predictions = record.get("resps", [])
        kept_predictions = record.get("filtered_resps", [])

        for label, value in field_values:
            print(label, value)
        print("Accuracy:", accuracy)

        # Each raw response wraps its (score, correct) pair in one extra list level;
        # only the score is shown.
        if raw_predictions:
            print("Predictions (resps):")
            for i, wrapped in enumerate(raw_predictions):
                score, _correct = wrapped[0][0], wrapped[0][1]
                print(f"  Response {i + 1}: Score = {score}")

        # Filtered responses are already flat (score, correct) pairs.
        if kept_predictions:
            print("Filtered Predictions (filtered_resps):")
            for i, pair in enumerate(kept_predictions):
                score, _correct = pair
                print(f"  Filtered Response {i + 1}: Score = {score}")

        print("=" * 40)


In [None]:

blimp_path = '/content/drive/MyDrive/new_evaluation_pipeline/results/blimp/babylama-attentionchange_correct'
ewok_path = '/content/drive/MyDrive/new_evaluation_pipeline/results/ewok/babylama-attentionchange_correct'



In [None]:
# Process and print blimp samples
print("Processing Blimp Tasks\n" + "="*60)
# os.listdir() returns entries in arbitrary order; sorting keeps the report stable across runs.
for subtask_file in sorted(os.listdir(blimp_path)):
    if subtask_file.endswith('.jsonl'):
        file_path = os.path.join(blimp_path, subtask_file)
        print(f"\nSubtask: {subtask_file}")
        samples = load_jsonl_or_json_array(file_path)
        print_blimp_samples(samples)


Processing Blimp Tasks

Subtask: blimp_adjunct_island_filtered_results.jsonl
Sentence (Good): Who should Derek hug after shocking Richard?
Sentence (Bad): Who should Derek hug Richard after shocking?
Accuracy: 1.0
Predictions (resps):
  Response 1: Score = -106.24250793457031, Correct = False
  Response 2: Score = -106.40333557128906, Correct = False
Filtered Predictions (filtered_resps):
  Filtered Response 1: Score = -106.24250793457031, Correct = False
  Filtered Response 2: Score = -106.40333557128906, Correct = False
Sentence (Good): What had Theresa walked through while talking about that high school?
Sentence (Bad): What had Theresa walked through that high school while talking about?
Accuracy: 0.0
Predictions (resps):
  Response 1: Score = -121.53715515136719, Correct = False
  Response 2: Score = -118.0936279296875, Correct = False
Filtered Predictions (filtered_resps):
  Filtered Response 1: Score = -121.53715515136719, Correct = False
  Filtered Response 2: Score = -118.0936

In [None]:
# Process and print ewok samples
print("\nProcessing Ewok Tasks\n" + "="*60)
# os.listdir() returns entries in arbitrary order; sorting keeps the report stable across runs.
for subtask_file in sorted(os.listdir(ewok_path)):
    if subtask_file.endswith('.jsonl'):
        file_path = os.path.join(ewok_path, subtask_file)
        print(f"\nSubtask: {subtask_file}")
        samples = load_jsonl_or_json_array(file_path)
        print_ewok_samples(samples)


Processing Ewok Tasks

Subtask: ewok_agent-properties_filtered_results.jsonl
Domain: agent-properties
Concept A: believe
Concept B: doubt
Context 1: Ali is in the bakery. Ali sees the candle inside.
Context 2: Ali is in the bakery. Ali sees the candle outside.
Target 1: Ali believes that the candle is in the bakery.
Target 2: Ali doubts that the candle is in the bakery.
Accuracy: 1.0
Predictions (resps):
  Response 1: Score = -62.393123626708984
  Response 2: Score = -70.64193725585938
Filtered Predictions (filtered_resps):
  Filtered Response 1: Score = -62.393123626708984
  Filtered Response 2: Score = -70.64193725585938
Domain: agent-properties
Concept A: believe
Concept B: doubt
Context 1: Ali is in the bakery. Ali sees the candle outside.
Context 2: Ali is in the bakery. Ali sees the candle inside.
Target 1: Ali doubts that the candle is in the bakery.
Target 2: Ali believes that the candle is in the bakery.
Accuracy: 0.0
Predictions (resps):
  Response 1: Score = -71.09725952148