In [None]:
# Root directory of the local checkout of the benchmark repository
git_repo_filepath = "/content/episodic-memory-benchmark"


### Loading books (unchanged — uses existing generation wrapper)

In [None]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper

# Settings shared by every book below.
book_parameters = {'indexing': 'default', 'nb_summaries': 0}

# Derive all locations from the repo path set in the first cell, so that
# changing `git_repo_filepath` is enough to relocate the whole notebook
# (these paths were previously hard-coded a second time).
repo_root = Path(git_repo_filepath)
data_folder = repo_root / 'epbench' / 'data'
env_file = repo_root / '.env'

# Generation with Claude -- 20 events (kept as-is so benchmark generation is preserved)
prompt_parameters = {'nb_events': 20, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
benchmark_claude_default_20 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

# Generation with Claude -- 200 events
# The parameter dicts are rebuilt rather than reused so each wrapper receives
# its own objects.
prompt_parameters = {'nb_events': 200, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
benchmark_claude_default_200 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

print('Books loaded (generation wrapper created).')


### Helper: short name mapping (recognizes Qwen)

In [None]:
def get_short_name_from_model_name(answering_model_name, answering_kind, answering_embedding_chunk):
    """Build a short display label for an answering model and its answering strategy.

    Args:
        answering_model_name: Full model identifier, e.g.
            'qwen/qwen3-vl-8b-thinking' or 'claude-3-5-sonnet-20240620'.
        answering_kind: 'prompting', 'rag' or 'ftuning'. Any other value is
            treated like 'prompting' (no suffix).
        answering_embedding_chunk: Only consulted when `answering_kind` is
            'rag'; 'chapter' yields the '(rag, c)' suffix, anything else '(rag)'.

    Returns:
        The short model label, followed by a strategy suffix for 'rag' and
        'ftuning'.
    """
    # Ordered (substring, label) pairs; the first match wins, so a more
    # specific substring must come before one it contains
    # ('gpt-4o-mini' before 'gpt-4o').
    known_models = (
        ('gpt-4o-mini', 'gpt-4o-mini'),
        ('gpt-4o', 'gpt-4o'),
        ('claude-3-5-sonnet', 'cl-3.5-sonnet'),
        ('claude-3-haiku', 'cl-3-haiku'),
        ('o1-mini', 'o1-mini'),
        ('llama-3.1', 'llama-3.1'),
        # Only genuine VL checkpoints get the 'qwen3-vl' label. Other
        # 'qwen/qwen3*' models fall through to the generic rule below instead
        # of being mislabelled as VL.
        ('qwen3-vl', 'qwen3-vl'),
    )
    for needle, label in known_models:
        if needle in answering_model_name:
            model_name = label
            break
    else:
        # Unknown model: drop any 'provider/' prefix and keep the last segment.
        model_name = answering_model_name.split('/')[-1]

    if answering_kind == 'rag':
        if answering_embedding_chunk == 'chapter':
            return f"{model_name} (rag, c)"
        return f"{model_name} (rag)"
    if answering_kind == 'ftuning':
        return f"{model_name} (ftuning)"
    # 'prompting' and any unrecognised kind: bare model label.
    return model_name


### Experiments configuration — replaced models with Qwen
This list drives `get_precomputed_results` below. The previous multi-provider list has been replaced with entries that all use `qwen/qwen3-vl-8b-thinking`. The `ftuning` entries are kept as placeholders: most hosted OpenRouter endpoints do not support provider-side fine-tuning, so enable them only if your provider offers that feature.

In [None]:
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Every experiment answers with the same Qwen model, on the 20- and 200-event books.
qwen_model_name = 'qwen/qwen3-vl-8b-thinking'
book_sizes = (20, 200)

experiments = (
    # prompting, 20 then 200 events
    [
        {'book_nb_events': nb, 'answering_kind': 'prompting', 'answering_model_name': qwen_model_name}
        for nb in book_sizes
    ]
    # RAG (placeholders): for each book size, paragraph chunks then chapter chunks
    + [
        {'book_nb_events': nb, 'answering_kind': 'rag', 'answering_model_name': qwen_model_name, 'answering_embedding_chunk': chunk}
        for nb in book_sizes
        for chunk in ('paragraph', 'chapter')
    ]
    # fine-tuning placeholders (do NOT enable unless your provider supports hosted fine-tuning and you have credentials)
    + [
        {'book_nb_events': nb, 'answering_kind': 'ftuning', 'answering_model_name': qwen_model_name}
        for nb in book_sizes
    ]
)

for experiment in experiments:
    # Entries that are not RAG carry an explicit 'n/a' chunk so every dict has the same keys.
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    # The books themselves were generated with Claude; some scripts expect a 'book_model_name'.
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"Configured {len(experiments)} experiments (Qwen entries).")

# Lookup of the generated books, keyed by the variable names used when loading them.
all_benchmarks = dict(
    benchmark_claude_default_20=benchmark_claude_default_20,
    benchmark_claude_default_200=benchmark_claude_default_200,
)

# Run the precomputed-results helper (this will create EvaluationWrapper objects for each experiment)
df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df  # show dataframe object


### If you want to run answering loops directly (prompting / rag / ftuning) using EvaluationWrapper
Below are example cells you can run instead of `get_precomputed_results` if you want more direct control. They call `EvaluationWrapper` for the Qwen model.
- **Important**: make sure your `env_file` contains correct API keys / provider configs. For OpenRouter-style endpoints you likely need `OPENROUTER_API_KEY` or similar in that `.env` and the underlying `epbench` code must support that provider.
- Fine-tuning (`ftuning`) is left as a placeholder: it sets `ftuning_need_upload=False` and `ftuning_need_actual_tune=False`, so it won't attempt hosted tuning unless you enable those flags.

In [None]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper
from epbench.src.evaluation.generator_answers_2_rag import get_top_n

# --- Prompting (in-context) ---
# The answering model is the same for both books, so it is named once up front.
model_name = 'qwen/qwen3-vl-8b-thinking'

for my_benchmark in (benchmark_claude_default_20, benchmark_claude_default_200):
    # Build a fresh parameter dict per book so the two runs share no state.
    answering_parameters = dict(
        kind='prompting',
        model_name=model_name,
        max_new_tokens=4096,
        sleeping_time=1,
        policy='remove_duplicates',
    )
    print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using {model_name}")
    # Only the wrapper for the last book stays bound to `my_evaluation`.
    my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print('Prompting experiments finished (Qwen).')


In [None]:
# --- RAG experiments (builds on embeddings + retrieval) ---
# One run per (book, chunking granularity) pair, all answered by the same model.
model_name = 'qwen/qwen3-vl-8b-thinking'

for my_benchmark in (benchmark_claude_default_20, benchmark_claude_default_200):
    for embedding_chunk in ('paragraph', 'chapter'):
        # `top_n` depends on both the chunking and the book, so it is computed per run.
        answering_parameters = dict(
            kind='rag',
            model_name=model_name,
            embedding_chunk=embedding_chunk,
            max_new_tokens=4096,
            sleeping_time=0,
            embedding_model='text-embedding-3-small',
            embedding_batch_size=2048,
            top_n=get_top_n(embedding_chunk, my_benchmark),
            policy='remove_duplicates',
        )
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with rag using {model_name} ({embedding_chunk})")
        # Only the wrapper for the last (book, chunk) pair stays bound to `my_evaluation`.
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print('RAG experiments finished (Qwen).')


In [None]:
# --- Fine-tuning placeholders ---
model_name = 'qwen/qwen3-vl-8b-thinking'

for my_benchmark in (benchmark_claude_default_20, benchmark_claude_default_200):
    # Both `ftuning_need_upload` and `ftuning_need_actual_tune` are off here.
    # The code below will NOT actually perform hosted tuning unless you enable ftuning_need_actual_tune=True
    answering_parameters = dict(
        kind='ftuning',
        model_name=model_name,
        max_new_tokens=4096,
        sleeping_time=0,
        ftuning_input_data_policy='single',
        ftuning_need_upload=False,
        ftuning_need_actual_tune=False,
        batch_size='auto',
        learning_rate_multiplier='auto',
        n_epochs=5,
        policy='remove_duplicates',
    )
    print(f"(Placeholder) Document with {my_benchmark.nb_tokens()} tokens, ftuning using {model_name} — ftuning disabled by default")
    # Only the wrapper for the last book stays bound to `my_evaluation`.
    my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print('Ftuning placeholders executed (no hosted tuning by default).')


### Analysis / plotting cells (kept as in original notebook)
Run these after `df` (the dataframe built earlier) is available.

In [None]:
import pandas as pd
from epbench.src.results.average_groups import extract_groups
# Group the results of the 200-event book along the 'get' and
# 'bins_items_correct_answer' dimensions.
nb_events = 200
relative_to = ['get', 'bins_items_correct_answer']
grouped_results = extract_groups(df, nb_events, relative_to)

# Keep only the rows whose 'get' value is 'all'; the column is then constant, so drop it.
is_all_rows = grouped_results['get'] == 'all'
df_results = grouped_results.loc[is_all_rows].drop(columns='get')
df_results.head()


### Notes / next steps
- If the notebook UI still shows old content after you edited the raw JSON, **restart kernel and clear outputs**, then re-run the notebook top→bottom.
- For OpenRouter / open-source-hosted Qwen endpoints: ensure your `.env` contains the proper variables expected by the `epbench` code (check `epbench` auth/provider wrappers). Many community endpoints require different env var names; inspect `epbench` provider code if you need to change key names.
- Fine-tuning: provider dependent. Most public endpoints (OpenRouter) do not offer the same hosted fine-tuning mechanics as OpenAI; treat ftuning entries as placeholders unless you have a provider-specific API and the `epbench` wrapper supports it.
