In [1]:
git_repo_filepath = '/content/episodic-memory-benchmark'

# Book generation -- End to end
- Generate the books and the corresponding question/answer pairs
- If the data already exists, only load books and regenerate QA pairs
- Additional benchmarks have been generated, please look at `rebuttal_generating_book_variations.ipynb` for the complete list
- Duration for loading the books and regenerate the questions: 1 minute

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
book_parameters = {'indexing': 'default', 'nb_summaries': 0}
data_folder = Path(git_repo_filepath) / 'epbench' / 'data'
env_file = Path(git_repo_filepath) / '.env'

print("Generation with Claude -- 10 events")
prompt_parameters = {'nb_events': 20, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 5} # itermax is integer, 1 for a single try
benchmark_claude_20 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file, rechecking=False)

print("Generation with Claude -- 200 events")
prompt_parameters = {'nb_events': 50, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
benchmark_claude_200 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file, rechecking=False)

print("Two books+QAs generated")

Generation with Claude -- 10 events


At iteration 0, 33.70% remaining with issues (674/2000), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125, 127, 128, 130, 136, 138, 143, 144, 146, 147, 148, 149, 150, 152, 155, 156, 160, 162, 163, 166, 169, 172, 175, 177, 178, 180, 181, 182, 185, 189, 193, 197, 199, 211, 223, 224, 225, 230, 234, 241, 245, 248, 250, 251, 252, 260, 266, 275, 277, 278, 279, 281, 282, 283, 285, 286, 293, 294, 297, 301, 305, 308, 311, 314, 315, 317, 319, 320, 324, 325, 327, 328, 329, 330, 331, 333, 334, 337, 339, 341, 344, 345, 349, 353, 355, 362, 365, 372, 376, 377, 378, 389, 395, 398, 400, 401, 407, 408, 411, 414, 415, 420, 423, 427, 431, 432, 433, 442, 443, 447, 451, 453, 454, 455, 462, 468, 470, 472, 473, 479, 481, 486, 487, 491, 494, 495, 498, 499, 500, 501, 503, 504, 505, 506, 508, 511, 512, 515, 516, 518, 521, 525, 527, 528, 530, 536, 538, 540, 542, 548, 550, 552, 553, 555, 556, 557, 558, 563, 56

At iteration 1, 16.65% remaining with issues (333/2000), for index: [11, 13, 16, 42, 44, 56, 59, 67, 79, 80, 93, 96, 106, 122, 127, 128, 130, 136, 143, 144, 146, 147, 150, 156, 160, 162, 163, 166, 169, 172, 175, 182, 193, 211, 223, 224, 225, 234, 245, 248, 250, 251, 252, 266, 275, 278, 279, 281, 283, 285, 297, 301, 308, 311, 315, 317, 319, 324, 327, 331, 337, 341, 344, 353, 355, 372, 376, 378, 389, 395, 398, 400, 407, 411, 420, 433, 442, 451, 453, 454, 462, 472, 473, 486, 491, 494, 495, 498, 500, 501, 503, 504, 506, 508, 512, 516, 525, 527, 528, 540, 542, 552, 555, 557, 558, 563, 572, 575, 591, 606, 607, 618, 625, 626, 638, 654, 657, 662, 663, 667, 668, 685, 694, 721, 723, 725, 727, 730, 738, 739, 744, 756, 757, 758, 777, 786, 797, 801, 810, 811, 814, 816, 821, 822, 844, 845, 848, 849, 864, 871, 881, 883, 897, 902, 906, 915, 924, 925, 931, 940, 943, 946, 953, 956, 959, 961, 967, 976, 992, 998, 1006, 1009, 1014, 1025, 1038, 1043, 1044, 1057, 1059, 1060, 1068, 1085, 1106, 1110, 1114, 112

At iteration 2, 9.75% remaining with issues (195/2000), for index: [13, 16, 42, 44, 56, 67, 79, 93, 96, 106, 143, 144, 146, 150, 156, 160, 162, 166, 169, 182, 193, 211, 223, 245, 266, 275, 279, 283, 285, 297, 311, 319, 324, 331, 355, 372, 376, 378, 395, 398, 400, 420, 433, 453, 454, 472, 494, 498, 500, 501, 504, 506, 508, 516, 528, 540, 542, 552, 555, 557, 563, 625, 638, 654, 663, 667, 694, 727, 738, 739, 756, 757, 777, 801, 814, 821, 822, 844, 845, 871, 881, 883, 897, 915, 924, 931, 940, 943, 953, 956, 959, 961, 967, 998, 1009, 1025, 1038, 1043, 1044, 1057, 1060, 1068, 1085, 1121, 1123, 1131, 1134, 1142, 1168, 1169, 1214, 1216, 1217, 1249, 1258, 1261, 1285, 1297, 1300, 1304, 1311, 1356, 1357, 1360, 1371, 1376, 1380, 1383, 1393, 1415, 1418, 1426, 1437, 1449, 1450, 1452, 1454, 1456, 1467, 1502, 1505, 1520, 1521, 1523, 1541, 1547, 1553, 1554, 1557, 1564, 1565, 1589, 1597, 1598, 1608, 1628, 1671, 1675, 1683, 1694, 1712, 1717, 1720, 1722, 1725, 1727, 1728, 1730, 1747, 1748, 1760, 1763, 179

At iteration 3, 6.50% remaining with issues (130/2000), for index: [16, 42, 44, 56, 67, 93, 96, 106, 143, 144, 146, 156, 160, 182, 193, 211, 245, 266, 283, 331, 376, 395, 398, 453, 472, 498, 501, 506, 508, 516, 528, 540, 542, 552, 555, 563, 638, 694, 727, 738, 739, 757, 777, 801, 814, 822, 871, 883, 897, 924, 940, 956, 959, 961, 967, 1009, 1025, 1038, 1043, 1044, 1060, 1068, 1085, 1121, 1134, 1168, 1169, 1214, 1217, 1258, 1261, 1285, 1297, 1300, 1304, 1311, 1356, 1357, 1371, 1376, 1380, 1393, 1415, 1418, 1426, 1437, 1449, 1450, 1454, 1456, 1467, 1520, 1521, 1523, 1553, 1554, 1564, 1565, 1597, 1598, 1608, 1671, 1675, 1694, 1712, 1717, 1722, 1725, 1727, 1728, 1730, 1748, 1792, 1807, 1817, 1826, 1832, 1855, 1882, 1918, 1919, 1924, 1932, 1945, 1952, 1959, 1962, 1976, 1980, 1996].


At final iteration 4, 4.95% remaining with issues (99/2000), for index: [16, 42, 44, 56, 67, 93, 146, 156, 160, 182, 193, 211, 245, 266, 283, 331, 453, 472, 498, 501, 506, 508, 516, 540, 542, 555, 563, 638, 738, 739, 757, 777, 801, 814, 871, 883, 897, 924, 959, 961, 967, 1009, 1025, 1038, 1043, 1044, 1060, 1121, 1134, 1168, 1217, 1261, 1285, 1297, 1300, 1304, 1356, 1357, 1371, 1376, 1380, 1415, 1418, 1437, 1449, 1450, 1454, 1456, 1520, 1521, 1553, 1554, 1564, 1565, 1597, 1671, 1675, 1694, 1712, 1717, 1722, 1725, 1727, 1728, 1730, 1792, 1807, 1826, 1832, 1882, 1918, 1919, 1924, 1945, 1952, 1962, 1976, 1980, 1996].
itermax reached but some events still did not pass the verification


for 1M chapter book, only select a subset of the questions
Generation with Claude -- 200 events


At iteration 0, 33.70% remaining with issues (674/2000), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125, 127, 128, 130, 136, 138, 143, 144, 146, 147, 148, 149, 150, 152, 155, 156, 160, 162, 163, 166, 169, 172, 175, 177, 178, 180, 181, 182, 185, 189, 193, 197, 199, 211, 223, 224, 225, 230, 234, 241, 245, 248, 250, 251, 252, 260, 266, 275, 277, 278, 279, 281, 282, 283, 285, 286, 293, 294, 297, 301, 305, 308, 311, 314, 315, 317, 319, 320, 324, 325, 327, 328, 329, 330, 331, 333, 334, 337, 339, 341, 344, 345, 349, 353, 355, 362, 365, 372, 376, 377, 378, 389, 395, 398, 400, 401, 407, 408, 411, 414, 415, 420, 423, 427, 431, 432, 433, 442, 443, 447, 451, 453, 454, 455, 462, 468, 470, 472, 473, 479, 481, 486, 487, 491, 494, 495, 498, 499, 500, 501, 503, 504, 505, 506, 508, 511, 512, 515, 516, 518, 521, 525, 527, 528, 530, 536, 538, 540, 542, 548, 550, 552, 553, 555, 556, 557, 558, 563, 56

At iteration 1, 16.65% remaining with issues (333/2000), for index: [11, 13, 16, 42, 44, 56, 59, 67, 79, 80, 93, 96, 106, 122, 127, 128, 130, 136, 143, 144, 146, 147, 150, 156, 160, 162, 163, 166, 169, 172, 175, 182, 193, 211, 223, 224, 225, 234, 245, 248, 250, 251, 252, 266, 275, 278, 279, 281, 283, 285, 297, 301, 308, 311, 315, 317, 319, 324, 327, 331, 337, 341, 344, 353, 355, 372, 376, 378, 389, 395, 398, 400, 407, 411, 420, 433, 442, 451, 453, 454, 462, 472, 473, 486, 491, 494, 495, 498, 500, 501, 503, 504, 506, 508, 512, 516, 525, 527, 528, 540, 542, 552, 555, 557, 558, 563, 572, 575, 591, 606, 607, 618, 625, 626, 638, 654, 657, 662, 663, 667, 668, 685, 694, 721, 723, 725, 727, 730, 738, 739, 744, 756, 757, 758, 777, 786, 797, 801, 810, 811, 814, 816, 821, 822, 844, 845, 848, 849, 864, 871, 881, 883, 897, 902, 906, 915, 924, 925, 931, 940, 943, 946, 953, 956, 959, 961, 967, 976, 992, 998, 1006, 1009, 1014, 1025, 1038, 1043, 1044, 1057, 1059, 1060, 1068, 1085, 1106, 1110, 1114, 112

At iteration 2, 9.75% remaining with issues (195/2000), for index: [13, 16, 42, 44, 56, 67, 79, 93, 96, 106, 143, 144, 146, 150, 156, 160, 162, 166, 169, 182, 193, 211, 223, 245, 266, 275, 279, 283, 285, 297, 311, 319, 324, 331, 355, 372, 376, 378, 395, 398, 400, 420, 433, 453, 454, 472, 494, 498, 500, 501, 504, 506, 508, 516, 528, 540, 542, 552, 555, 557, 563, 625, 638, 654, 663, 667, 694, 727, 738, 739, 756, 757, 777, 801, 814, 821, 822, 844, 845, 871, 881, 883, 897, 915, 924, 931, 940, 943, 953, 956, 959, 961, 967, 998, 1009, 1025, 1038, 1043, 1044, 1057, 1060, 1068, 1085, 1121, 1123, 1131, 1134, 1142, 1168, 1169, 1214, 1216, 1217, 1249, 1258, 1261, 1285, 1297, 1300, 1304, 1311, 1356, 1357, 1360, 1371, 1376, 1380, 1383, 1393, 1415, 1418, 1426, 1437, 1449, 1450, 1452, 1454, 1456, 1467, 1502, 1505, 1520, 1521, 1523, 1541, 1547, 1553, 1554, 1557, 1564, 1565, 1589, 1597, 1598, 1608, 1628, 1671, 1675, 1683, 1694, 1712, 1717, 1720, 1722, 1725, 1727, 1728, 1730, 1747, 1748, 1760, 1763, 179

At iteration 3, 6.50% remaining with issues (130/2000), for index: [16, 42, 44, 56, 67, 93, 96, 106, 143, 144, 146, 156, 160, 182, 193, 211, 245, 266, 283, 331, 376, 395, 398, 453, 472, 498, 501, 506, 508, 516, 528, 540, 542, 552, 555, 563, 638, 694, 727, 738, 739, 757, 777, 801, 814, 822, 871, 883, 897, 924, 940, 956, 959, 961, 967, 1009, 1025, 1038, 1043, 1044, 1060, 1068, 1085, 1121, 1134, 1168, 1169, 1214, 1217, 1258, 1261, 1285, 1297, 1300, 1304, 1311, 1356, 1357, 1371, 1376, 1380, 1393, 1415, 1418, 1426, 1437, 1449, 1450, 1454, 1456, 1467, 1520, 1521, 1523, 1553, 1554, 1564, 1565, 1597, 1598, 1608, 1671, 1675, 1694, 1712, 1717, 1722, 1725, 1727, 1728, 1730, 1748, 1792, 1807, 1817, 1826, 1832, 1855, 1882, 1918, 1919, 1924, 1932, 1945, 1952, 1959, 1962, 1976, 1980, 1996].


At iteration 4, 4.95% remaining with issues (99/2000), for index: [16, 42, 44, 56, 67, 93, 146, 156, 160, 182, 193, 211, 245, 266, 283, 331, 453, 472, 498, 501, 506, 508, 516, 540, 542, 555, 563, 638, 738, 739, 757, 777, 801, 814, 871, 883, 897, 924, 959, 961, 967, 1009, 1025, 1038, 1043, 1044, 1060, 1121, 1134, 1168, 1217, 1261, 1285, 1297, 1300, 1304, 1356, 1357, 1371, 1376, 1380, 1415, 1418, 1437, 1449, 1450, 1454, 1456, 1520, 1521, 1553, 1554, 1564, 1565, 1597, 1671, 1675, 1694, 1712, 1717, 1722, 1725, 1727, 1728, 1730, 1792, 1807, 1826, 1832, 1882, 1918, 1919, 1924, 1945, 1952, 1962, 1976, 1980, 1996].


At iteration 5, 3.85% remaining with issues (77/2000), for index: [16, 56, 67, 93, 146, 156, 160, 182, 193, 211, 245, 266, 283, 331, 453, 472, 506, 508, 516, 540, 542, 555, 563, 638, 738, 777, 801, 814, 871, 883, 897, 924, 959, 961, 1009, 1038, 1043, 1044, 1060, 1121, 1134, 1217, 1261, 1285, 1304, 1356, 1357, 1371, 1376, 1380, 1415, 1418, 1437, 1450, 1454, 1456, 1520, 1521, 1553, 1554, 1564, 1597, 1671, 1675, 1694, 1712, 1722, 1725, 1727, 1728, 1832, 1882, 1918, 1919, 1952, 1980, 1996].


At iteration 6, 3.00% remaining with issues (60/2000), for index: [16, 56, 67, 156, 160, 182, 211, 245, 266, 331, 453, 506, 508, 516, 542, 555, 777, 814, 871, 883, 897, 924, 959, 961, 1009, 1038, 1043, 1044, 1060, 1121, 1134, 1217, 1261, 1285, 1304, 1356, 1371, 1380, 1418, 1450, 1454, 1456, 1520, 1521, 1553, 1554, 1597, 1671, 1675, 1694, 1712, 1722, 1725, 1727, 1728, 1832, 1882, 1919, 1952, 1980].


At iteration 7, 2.35% remaining with issues (47/2000), for index: [16, 56, 67, 156, 160, 211, 245, 331, 453, 508, 516, 542, 555, 777, 814, 871, 883, 897, 959, 961, 1009, 1038, 1043, 1060, 1134, 1217, 1261, 1285, 1304, 1356, 1380, 1418, 1454, 1456, 1520, 1521, 1553, 1671, 1675, 1694, 1722, 1725, 1727, 1728, 1919, 1952, 1980].


At iteration 8, 2.10% remaining with issues (42/2000), for index: [16, 56, 156, 160, 245, 331, 453, 508, 516, 542, 555, 777, 814, 871, 883, 897, 959, 961, 1009, 1038, 1043, 1060, 1134, 1261, 1285, 1304, 1356, 1380, 1418, 1454, 1456, 1520, 1521, 1553, 1671, 1722, 1725, 1727, 1728, 1919, 1952, 1980].


At final iteration 9, 1.65% remaining with issues (33/2000), for index: [16, 56, 156, 160, 245, 331, 453, 508, 516, 555, 777, 871, 883, 897, 959, 961, 1009, 1038, 1261, 1285, 1304, 1380, 1418, 1454, 1456, 1520, 1521, 1553, 1671, 1728, 1919, 1952, 1980].
itermax reached but some events still did not pass the verification


for 1M chapter book, only select a subset of the questions
Two books+QAs generated


# Exploration

In [3]:
# Selection of the book of interest
my_benchmark = benchmark_claude_200

#### Initial exploration

In [4]:
# Get event 0, as a list
my_benchmark.events[0]

['September 13, 2025',
 'Bethpage Black Course',
 'Ezra Edwards',
 'Parkour Workshop',
 'Demonstrated cat leaps']

In [5]:
# Get metadata for event 0, as a dictionary
my_benchmark.meta_events[0]

{'nb_paragraphs': 7,
 'idx_paragraph': {'location': 2, 'date': 7, 'entity': 2, 'content': 2},
 'style': 'thriller'}

#### Raw generated sample indexed by (event, iteration), before chaptering [debug]

In [6]:
# Print the successful iteration for an event_idx (those are raw samples, it is not the chapter indexing yet)
event_idx = 3
my_benchmark.pretty_print_debug_event_idx(event_idx)

[32m*Correct* sample (event=3, iter=0)[0m

[34m['May 07, 2024', 'Hither Hills State Park', 'Zoe Brown', 'Karaoke Night', 'Performed with live band accompaniment'][0m
[34m{'nb_paragraphs': 5, 'idx_paragraph': {'location': 3, 'date': 1, 'entity': 3, 'content': 4}, 'style': 'mystery'}[0m
[34m['Alma Aultman', 'Alondra Wilkinson'][0m
[90mGenerated chapter has 468 tokens[0m

The sultry evening air clung to her skin as she stepped out of the car, the distant thrum of music pulsing through the darkness. [30;42mMay 07,
2024[0m had finally arrived, and with it, the promise of an unforgettable night. She smoothed down her sequined dress, its sparkles catching the dim
light from the parking lot lamps. The path ahead wound through shadowy trees, leading to a clearing where laughter and the occasional off-key note
drifted on the breeze.

As she approached the makeshift stage, her heart raced with anticipation. The park's usual serenity had been transformed into a vibrant hub of
activity.

In [7]:
# Print the `event_idx` that are all invalid, even after all the iterations
invalid_samples = my_benchmark.invalid_debug_event_idx_func()
print(f"Invalid sample(s): {invalid_samples}")

Invalid sample(s): [16]


In [8]:
# Print details for an invalid sample
if len(invalid_samples) > 0:
    my_benchmark.pretty_print_debug_event_iter_idx(invalid_samples[0]) # by default take the last iteration

[31m*Incorrect* sample (event=16, iter=9)[0m
[90mIssue in *llm* verification: ['date'], as the answer is: Based on my analysis of the given text, here are my answers to the questions in the requested JSON format:

{
    "1": true,
    "2": false,
    "3": true,
    "4": true
}[0m

[34m['June 14, 2025', 'High Line', 'Zoe Brown', 'Tech Hackathon', 'Presented final projects'][0m
[34m{'nb_paragraphs': 2, 'idx_paragraph': {'location': 2, 'date': 2, 'entity': 2, 'content': 2}, 'style': 'horror'}[0m

(1) The air crackled with an eerie static as she stepped onto the elevated platform. Rows of flickering screens cast an otherworldly glow across the
faces of hunched figures, their fingers dancing frantically across keyboards. The acrid scent of ozone and desperation hung thick in the air. She
clutched her prototype close, its cold metal surface a stark contrast to her clammy palms. The distant rumble of thunder seemed to echo the pounding
of her heart. As she made her way through the lab

In [9]:
# Print a specific event_idx and iter_idx (for debugging)
my_benchmark.pretty_print_debug_event_iter_idx(3,0)

[32m*Correct* sample (event=3, iter=0)[0m

[34m['May 07, 2024', 'Hither Hills State Park', 'Zoe Brown', 'Karaoke Night', 'Performed with live band accompaniment'][0m
[34m{'nb_paragraphs': 5, 'idx_paragraph': {'location': 3, 'date': 1, 'entity': 3, 'content': 4}, 'style': 'mystery'}[0m

(1) The sultry evening air clung to her skin as she stepped out of the car, the distant thrum of music pulsing through the darkness. [30;42mMay 07,
2024[0m had finally arrived, and with it, the promise of an unforgettable night. She smoothed down her sequined dress, its sparkles catching the dim
light from the parking lot lamps. The path ahead wound through shadowy trees, leading to a clearing where laughter and the occasional off-key note
drifted on the breeze.

(2) As she approached the makeshift stage, her heart raced with anticipation. The park's usual serenity had been transformed into a vibrant hub of
activity. Fairy lights twinkled in the trees, casting a warm glow over the assembled crowd

#### At the book side

In [10]:
# Print the full book
my_benchmark.pretty_print_book()

Chapter 1

The harsh glare of the floodlights cast long shadows across the rugged terrain. Adrenaline coursed through his veins as
he surveyed the daunting obstacles before him. The parkour workshop had drawn a diverse crowd, each participant eager
to push their limits and master the urban art of movement. He flexed his fingers, anticipation building in his chest as
he prepared for the night's challenges.

At Bethpage Black Course, Ezra Edwards demonstrated cat leaps with a grace that belied the difficulty of the maneuver.
The other participants watched in awe as he effortlessly cleared the gap between two towering structures. Noa
Middleton, the lead instructor, nodded approvingly, a hint of a smile playing at the corners of their mouth.

As the workshop progressed, the air grew thick with tension. The obstacles became increasingly complex, testing the
limits of even the most seasoned traceurs. He felt a bead of sweat trickle down his spine as he approached the next
challenge – a serie

In [11]:
# Print a single chapter (note the difference of indexing with the original event index, since some indexes have been discarded)
my_benchmark.pretty_print_book_chapter(193)

Chapter 193 does not exist. Available chapters: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [12]:
# exact mapping between chapters and original raw event indexes
my_benchmark.debug_mapping_chapter_idx_to_event_idx

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49}

In [13]:
# Get the number of tokens
print(f"{my_benchmark.nb_tokens()} tokens in the book")
# Get the number of chapters
print(f"{my_benchmark.nb_chapters()} chapters in the book")

27082 tokens in the book
49 chapters in the book


#### Ground truth

In [14]:
# Ground truth for each chapter
df_book_groundtruth = my_benchmark.df_book_groundtruth
df_book_groundtruth

Unnamed: 0_level_0,chapter,date,location,entity,content,post_entities,n_date,n_location,n_entity,n_content,raw_generated_paragraph_idx,nb_paragraphs,style,idx_t,idx_s,idx_e,idx_c
chapter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,"September 13, 2025",Bethpage Black Course,Ezra Edwards,Parkour Workshop,"{Noa Middleton, Mara Ledbetter}",2,1,2,3,0,7,thriller,7,2,2,2
2,2,"September 22, 2026",American Museum of Natural History,Chloe Castillo,Fashion Show,"{Sienna Hamrick, Reid Blunt}",5,3,2,5,1,7,fantasy,7,5,5,3
3,3,"September 22, 2026",Port Jefferson,Henry Reed,Photography Exhibition,"{Miles Pritchett, Amira Hayes, Ronan Guevara}",5,3,7,1,2,1,detective,1,1,1,1
4,4,"May 07, 2024",Hither Hills State Park,Zoe Brown,Karaoke Night,"{Alondra Wilkinson, Alma Aultman}",1,1,5,2,3,5,mystery,1,3,3,4
5,5,"March 23, 2024",High Line,Logan Diaz,Business Networking Event,"{Meredith Gardner, Uri Wemple}",4,8,1,1,4,9,romance,2,2,8,8
6,6,"May 31, 2024",Woolworth Building,Julian Ross,Tech Hackathon,"{Hezekiah Bean, Sienna Lyles}",1,3,4,6,5,8,detective,6,8,2,3
7,7,"February 27, 2026",Washington Square Park,Benjamin Green,Ice Sculpture Exhibition,"{Dakari Liebert, Creed Berlin}",5,4,2,2,6,7,tragedy,7,7,5,1
8,8,"September 27, 2025",Snug Harbor Cultural Center,Owen Thomas,Parkour Workshop,"{Axl Mims, Rocco Cormier, Lilah Hamilton}",2,2,1,3,7,5,comedy,2,4,1,1
9,9,"February 27, 2026",Statue of Liberty,Mila Gonzalez,Theater Performance,"{Diem Nix, Eli Groth, Rey Pierre}",5,1,4,7,8,8,fantasy,5,8,8,1
10,10,"October 13, 2024",Queensboro Bridge,Henry Reed,Film Festival,"{Oakley Goetz, Zander Davenport}",3,1,7,3,9,6,mystery,3,6,3,4


#### Questions

In [15]:
# Selected questions
df_qa = my_benchmark.df_qa
# df_qa.iloc[0][['question', 'correct_answer', 'correct_answer_chapters']]
df_qa

Unnamed: 0,q_idx,bins_items_correct_answer,debug_level_2,question,cue,cue_completed,retrieval_type,get,correct_answer,correct_answer_chapters,correct_answer_detailed,n_items_correct_answer,n_chapters_correct_answer,debug_changed,debug_existing_change
0,8,0,905,Think about Ava Gray's experiences. Describe a...,"(*, *, ent, *)","(*, *, {Ava Gray}, *)",Event contents,all,{},{},{},0,0,"{entity, location}",False
1,35,0,700,Enumerate all activities that Thomas Nelson ha...,"(*, *, ent, *)","(*, *, {Thomas Nelson}, *)",Event contents,chronological,{},{},{},0,0,"{entity, content}",False
2,9,"{3,4,5}",222,Recall all events related to Film Festival. Pr...,"(*, *, *, c)","(*, *, *, {Film Festival})",Times,all,"{October 13, 2024, October 27, 2024}","{41, 10, 28}","{10: 'October 13, 2024', 28: 'October 27, 2024...",2,3,{},
3,11,1,389,Reflect on events related to Book Signing Even...,"(*, *, *, c)","(*, *, *, {Book Signing Event})",Entities,all,{Scarlett Thomas},{44},{44: 'Scarlett Thomas'},1,1,{},
4,10,1,27,Consider all events involving TED Talk. List a...,"(*, *, *, c)","(*, *, *, {TED Talk})",Spaces,all,{Williamsburg Bridge},{26},{26: 'Williamsburg Bridge'},1,1,{},
5,10,6+,28,Consider all events involving Tech Hackathon. ...,"(*, *, *, c)","(*, *, *, {Tech Hackathon})",Spaces,all,"{One World Trade Center, Woolworth Building, W...","{37, 6, 40, 14, 16, 20}","{6: 'Woolworth Building', 14: 'Snug Harbor Cul...",6,6,{},
6,7,0,646,Consider all events that Ava Gray has been inv...,"(*, *, ent, *)","(*, *, {Ava Gray}, *)",Spaces,all,{},{},{},0,0,"{entity, location}",False
7,31,0,991,What is the most recent location where Owen Ro...,"(*, *, ent, *)","(*, *, {Owen Rodriguez}, *)",Spaces,latest,{},{},{},0,0,"{entity, content, location, date}",False
8,11,"{3,4,5}",392,Reflect on events related to Film Festival. Pr...,"(*, *, *, c)","(*, *, *, {Film Festival})",Entities,all,"{Julian Ross, Henry Reed, Brooklyn Ross}","{41, 10, 28}","{10: 'Henry Reed', 28: 'Julian Ross', 41: 'Bro...",3,3,{},
9,8,0,905,Think about Ava Gray's experiences. Describe a...,"(*, *, ent, *)","(*, *, {Ava Gray}, *)",Event contents,all,{},{},{},0,0,"{entity, location}",False


In [16]:
# Widespreadness of the questions (with the default bins)
my_benchmark.df_qa_debug_widespreadness

Unnamed: 0_level_0,nb_of_bins_with_at_least_one_question,nb_of_questions_for_the_bin_with_the_least_and_most_questions,nb_of_questions_for_the_bin_with_the_least_and_most_questions
Unnamed: 0_level_1,count,min,max
cue,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(*, *, *, c)",5,1.0,5.0
"(*, *, ent, *)",5,1.0,5.0
"(*, *, ent, c)",3,2.0,5.0
"(*, s, *, *)",5,1.0,5.0
"(*, s, *, c)",3,1.0,5.0
"(*, s, ent, *)",2,5.0,5.0
"(*, s, ent, c)",2,5.0,5.0
"(t, *, *, *)",3,5.0,5.0
"(t, *, *, c)",3,5.0,5.0
"(t, *, ent, *)",2,5.0,5.0


In [17]:
# Complete list of questions related to a single chapter, used for fine-tuning
my_benchmark.finetuning_questions_one_chapter

Unnamed: 0,question,debug_chapter,q_idx,cue,cue_completed,retrieval_type,get,correct_answer,correct_answer_chapters,correct_answer_detailed,n_items_correct_answer,n_chapters_correct_answer,debug_changed,debug_existing_change
1,Consider all events involving Ballet Performan...,[30],10,"(*, *, *, c)","(*, *, *, {Ballet Performance})",Spaces,all,{Port Jefferson},{30},{30: 'Port Jefferson'},1,1,{},
2,Consider all events involving Benjamin Green a...,[14],18,"(*, s, ent, *)","(*, {Snug Harbor Cultural Center}, {Benjamin G...",Times,all,"{April 09, 2026}",{14},"{14: 'April 09, 2026'}",1,1,{},
3,Consider all events involving Benjamin Green a...,[7],18,"(*, s, ent, *)","(*, {Washington Square Park}, {Benjamin Green}...",Times,all,"{February 27, 2026}",{7},"{7: 'February 27, 2026'}",1,1,{},
4,Consider all events involving Carter Stewart a...,[30],18,"(*, s, ent, *)","(*, {Port Jefferson}, {Carter Stewart}, *)",Times,all,"{October 27, 2024}",{30},"{30: 'October 27, 2024'}",1,1,{},
7,Consider all events involving Ezra Edwards at ...,[23],18,"(*, s, ent, *)","(*, {New York Botanical Garden}, {Ezra Edwards...",Times,all,"{December 03, 2026}",{23},"{23: 'December 03, 2026'}",1,1,{},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,What is the most recent location where Lucy Ca...,[33],31,"(*, *, ent, *)","(*, *, {Lucy Carter}, *)",Spaces,latest,[High Line],[33],{33: 'High Line'},1,1,{},
581,What is the most recent location where Olivia ...,[38],31,"(*, *, ent, *)","(*, *, {Olivia Turner}, *)",Spaces,latest,[Williamsburg Bridge],[38],{38: 'Williamsburg Bridge'},1,1,{},
590,What was Jackson Ramos doing the last time the...,[18],32,"(*, *, ent, *)","(*, *, {Jackson Ramos}, *)",Event contents,latest,[Theater Performance],[18],{18: 'Theater Performance'},1,1,{},
592,What was Lucy Carter doing the last time they ...,[33],32,"(*, *, ent, *)","(*, *, {Lucy Carter}, *)",Event contents,latest,[Fire Dancing Performance],[33],{33: 'Fire Dancing Performance'},1,1,{},


In [18]:
df_all_questions = my_benchmark.df_qa
df_all_questions  # display in notebook

# --- Save all questions to CSV ---
output_csv_all = "/content/all_questions.csv"
df_qa.to_csv(output_csv_all, index=False)

print(f"Saved {len(df_all_questions)} questions to {output_csv_all}")

Saved 35 questions to /content/all_questions.csv
