In [1]:
# Load JSONL file (one JSON object per line)

import json

def load_jsonl(path):
    """Load JSONL file (one JSON object per line)."""
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records

data = load_jsonl("stories_with_outlines_first3000.jsonl")
# Print the first json object
print(data[0])

{'id': 1, 'story': 'Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear them. They were sad and scared, and they never got out of the pit.', 'outline': {'title': 'Tom and the Pit', 'characters': ['Tom', 'Sam'], 'setting': ['A warm and sunny place', 'A big pit'], 'events': {'e1_6a': {'rev': 1, 'summary': '

In [2]:
# Find good masking, continguos spans of text to mask out
def find_good_masking(input):
    events = input["outline"]["events"]
    good_masking = []
    for eid, event in events.items():
        if is_contiguous(event["from_lines"]):
            good_masking.append((eid, event["from_lines"]))
    return good_masking

# Check if line indices are contiguous
def is_contiguous(line_indices):
    sorted_indices = sorted(line_indices)
    return all(b - a == 1 for a, b in zip(sorted_indices, sorted_indices[1:]))

# Example usage of is_contiguous
print(is_contiguous([2,3,4]))  # True
print(is_contiguous([1,3,4]))  # False

# Example usage of find_good_masking
good_maskings = find_good_masking(data[0])
print(good_maskings)  # List of (event_id, from_lines) with contiguous

True
False
[('e1_6a', [1, 2]), ('e0_f8', [3, 4]), ('ea_60', [5, 6]), ('e2_e2', [7, 8, 9]), ('e9_1a', [10, 11]), ('e8_3d', [12, 13, 14]), ('e4_d0', [15, 16])]


In [3]:
import re

# Prepare the sentences from the story text into a list
def prepare_sentences(input):
    # Extract story into list of sentences
    # Remove \n 
    story_text = input["story"].replace("\n", " ").strip()
    # Split by ". " , "! " to get sentences
    story_lines = [line.strip() + '.' for line in re.split(r'(?<=[.!?]) +', story_text) if line]
    story_lines = [re.sub(r'([.!?])\1+$', r'\1', line) for line in story_lines]
    return story_lines

# Example usage of prepare_sentences
story_lines = prepare_sentences(data[0])
print(story_lines)  # List of sentences in the story

['Once upon a time, in a warm and sunny place, there was a big pit.', 'A little boy named Tom liked to play near the pit.', 'One day, Tom lost his red ball.', 'He was very sad.', 'Tom asked his friend, Sam, to help him search for the ball.', 'They looked high and low, but they could not find the ball.', 'Tom said, "I think my ball fell into the pit." Sam and Tom went close to the pit.', 'They were scared, but they wanted to find the red ball.', 'They looked into the pit, but it was too dark to see.', 'Tom said, "We must go in and search for my ball." They went into the pit to search.', 'It was dark and scary.', 'They could not find the ball.', 'They tried to get out, but the pit was too deep.', 'Tom and Sam were stuck in the pit.', 'They called for help, but no one could hear them.', 'They were sad and scared, and they never got out of the pit.']


In [4]:
# Debugging prepare_sentences
for i in [0, 67, 345, 2263, 432]:
    story_lines = prepare_sentences(data[i])
    # print(story_lines)  # List of sentences in the story
    # print(len(story_lines))  # Number of sentences
    # Check if number of sentences match with from_lines indices of the last event
    for eid, event in data[i]["outline"]["events"].items():
        max_index = max(event["from_lines"])
        # Only print if last loop
        if eid == list(data[i]["outline"]["events"].keys())[-1]:
            print(f"Event {eid} max from_line index: {max_index}, total story lines: {len(story_lines)}")

'''
story_lines = prepare_data(data[345])
# print line by line of story_lines
for i, line in enumerate(story_lines):
    print(f"{i+1}: {line}")

# print line by line of event from_lines for data[345] with summary
for eid, event in data[345]["outline"]["events"].items():
    print(f"Event {eid} from_lines: {event['from_lines']}")
    print(f"Event {eid} summary: {event['summary']}")
'''

Event e4_d0 max from_line index: 16, total story lines: 16
Event e4_90 max from_line index: 12, total story lines: 12
Event e4_ff max from_line index: 22, total story lines: 23
Event e5_57 max from_line index: 11, total story lines: 11
Event ea_63 max from_line index: 12, total story lines: 12


'\nstory_lines = prepare_data(data[345])\n# print line by line of story_lines\nfor i, line in enumerate(story_lines):\n    print(f"{i+1}: {line}")\n\n# print line by line of event from_lines for data[345] with summary\nfor eid, event in data[345]["outline"]["events"].items():\n    print(f"Event {eid} from_lines: {event[\'from_lines\']}")\n    print(f"Event {eid} summary: {event[\'summary\']}")\n'

In [5]:
def mask_span(story_lines, from_lines):
    mask = []
    mask.append("[MASK_START]")
    # 0 based
    start = from_lines[0] - 1
    end = from_lines[-1] - 1
    mask.extend(story_lines[start:end+1])
    mask.append("[MASK_END]")
    return mask


# Add event function for insertion and modification masking
def add_event(input, event_id):

    masked_story_lines = []
    # Add conditioning token <ADD>
    masked_story_lines.append("<ADD>")

    # Add the outline summary
    masked_story_lines.append("<START_OUTLINE>")
    for eid in input["outline"]["sequence"]:
        summary = input["outline"]["events"][eid]["summary"].strip()
        masked_story_lines.append("<BOE>")  # Beginning of event
        masked_story_lines.append(summary)
        masked_story_lines.append("<EOE>")  # End of event
    masked_story_lines.append("<END_OUTLINE>")

    # Add the story lines
    masked_story_lines.append("<START_STORY>")
    story_lines = prepare_sentences(input)
    # Copy story_lines until from_lines, then add [MASK], then copy rest of story_lines
    # Mask out the event lines by adding [MASK_START] and [MASK_END] between the ground truth
    from_lines = input["outline"]["events"][event_id]["from_lines"]
    masked_span = mask_span(story_lines, from_lines)
    start = from_lines[0] - 1
    end = from_lines[-1] - 1
    masked_story_lines.extend(story_lines[:start])
    masked_story_lines.extend(masked_span)
    masked_story_lines.extend(story_lines[end+1:])
    masked_story_lines.append("<END_STORY>")

    # Join the masked_story_lines into a single input text
    input_text = " ".join(masked_story_lines)
    
    # ground truth label
    target_text = masked_span[1:-1]  # Exclude [MASK_START] and [MASK_END]
    target_text = " ".join(target_text)

    return input_text, target_text

# Example usage of add_event
# Get an event_id from data[0]
print(find_good_masking(data[0]))
input_text, target_text = add_event(data[0], find_good_masking(data[0])[3][0])
print("Input Text:")
print(input_text)
print("Input Text Masked Span:")
print(input_text.split("[MASK_START]")[1].split("[MASK_END]")[0].strip())
print("Target Text:")
print(target_text)

[('e1_6a', [1, 2]), ('e0_f8', [3, 4]), ('ea_60', [5, 6]), ('e2_e2', [7, 8, 9]), ('e9_1a', [10, 11]), ('e8_3d', [12, 13, 14]), ('e4_d0', [15, 16])]
Input Text:
<ADD> <START_OUTLINE> <BOE> Tom enjoys playing near a big pit in a sunny location. <EOE> <BOE> Tom loses his red ball and feels very sad about it. <EOE> <BOE> Tom asks his friend Sam for help, and they search for the ball without success. <EOE> <BOE> Tom suspects the ball fell into the pit, and he and Sam approach it despite their fear. <EOE> <BOE> Determined to find the ball, Tom and Sam decide to go into the dark pit. <EOE> <BOE> Inside the pit, they search but cannot find the ball and realize they are stuck. <EOE> <BOE> Tom and Sam call for help, but no one hears them, leaving them sad and scared. <EOE> <END_OUTLINE> <START_STORY> Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad. Tom asked his friend, Sam, to hel

In [26]:
# Delete event function for deletion masking
def delete_event(input, event_id):

    masked_story_lines = []
    # Add conditioning token <DELETE>
    masked_story_lines.append("<DELETE>")

    # Add the outline summary
    masked_story_lines.append("<START_OUTLINE>")
    for eid in input["outline"]["sequence"]:
        # Skip the event to be deleted
        if eid == event_id:
            continue
        summary = input["outline"]["events"][eid]["summary"].strip()
        masked_story_lines.append("<BOE>")  # Beginning of event
        masked_story_lines.append(summary)
        masked_story_lines.append("<EOE>")  # End of event
    masked_story_lines.append("<END_OUTLINE>")

    # Add the story lines
    masked_story_lines.append("<START_STORY>")
    story_lines = prepare_sentences(input)
    # Copy story_lines until from_lines, then add [MASK], then copy rest of story_lines
    # Mask out the event lines by adding [MASK_START] and [MASK_END] between the ground truth
    from_lines = input["outline"]["events"][event_id]["from_lines"]
    masked_span = mask_span(story_lines, from_lines)
    start = from_lines[0] - 1
    end = from_lines[-1] - 1
    masked_story_lines.extend(story_lines[:start])
    masked_story_lines.extend(masked_span)
    masked_story_lines.extend(story_lines[end+1:])
    masked_story_lines.append("<END_STORY>")

    # Join the masked_story_lines into a single input text
    input_text = " ".join(masked_story_lines)
    
    # ground truth label
    target_text = masked_span[1:-1]  # Exclude [MASK_START] and [MASK_END]
    target_text = " ".join(target_text)

    return input_text, target_text

# Example usage of add_event
# Get an event_id from data[i]
i = 345
print(find_good_masking(data[i]))
event_id = find_good_masking(data[i])[3][0]  # Example event_id
input_text, target_text = delete_event(data[i], event_id)
# Check if deleted event is gone
print("Deleted Event Summary:")
print(data[i]["outline"]["events"][event_id]["summary"])
print(input_text)

[('ed_e9', [1, 2, 3]), ('ed_a2', [4, 5, 6]), ('ee_75', [7, 8, 9, 10]), ('e9_6c', [11, 12, 13]), ('e8_15', [14, 15, 16, 17]), ('e4_78', [18, 19, 20, 21]), ('e4_ff', [22])]
Deleted Event Summary:
Tim forgot to watch his coat while playing, and when it was time to go home, he discovered it was missing.
<DELETE> <START_OUTLINE> <BOE> In a small house, there was a funny coat that was red, blue, and green, making people laugh. <EOE> <BOE> A little boy named Tim loved to wear his favorite funny coat every day. <EOE> <BOE> One sunny day, Tim played with his friends in the park, enjoying running, jumping, and laughing. <EOE> <BOE> Despite searching everywhere with his friends, Tim could not find his lost funny coat and felt very sad. <EOE> <BOE> Tim went home without his coat, and even though his parents tried to help, the coat was gone forever. <EOE> <BOE> Tim learned to remember to watch his things, but he missed his funny coat every day. <EOE> <END_OUTLINE> <START_STORY> Once upon a time, in

In [20]:
import re

def _block(text, start_tag, end_tag):
    i = text.find(start_tag)
    j = text.find(end_tag, i + len(start_tag)) if i != -1 else -1
    return text[i+len(start_tag):j] if (i != -1 and j != -1) else ""

def _norm(s):
    return " ".join(s.split())

def deleted_outline_absent(input_text, rec, event_id) -> bool:
    """Return True iff the deleted event's summary is NOT present in the outline block."""
    deleted_summary = rec["outline"]["events"][event_id]["summary"]
    outline = _block(input_text, "<START_OUTLINE>", "<END_OUTLINE>")
    # Extract every <BOE> ... <EOE> summary inside the outline
    summaries = re.findall(r"<BOE>\s*(.*?)\s*<EOE>", outline, flags=re.DOTALL)
    del_norm = _norm(deleted_summary)
    # Check normalized equality against each printed summary
    return all(_norm(s) != del_norm for s in summaries)

# --- Example usage ---
for i in [0, 67, 345, 2263, 432]:
    event_id = find_good_masking(data[i])[3][0]
    input_text, target_text = delete_event(data[i], event_id)

    ok = deleted_outline_absent(input_text, data[i], event_id)
    print("Deleted event absent from outline?", ok)  # Expect True


Deleted event absent from outline? True
Deleted event absent from outline? True
Deleted event absent from outline? True
Deleted event absent from outline? True
Deleted event absent from outline? True


In [7]:
# Run add_event and delete_event and append to a dataset

for i in range(len(data)):
    find_good_maskings = find_good_masking(data[i])
    # Do at least most 5 add_event
    for j in range(min(5, len(find_good_maskings))):
        event_id = find_good_maskings[j][0]
        input_text, target_text = add_event(data[i], event_id)
        # Append to dataset (not implemented here)
    # Do at least most 3 delete_event
    for j in range(min(3, len(find_good_maskings))):
        event_id = find_good_maskings[j][0]
        input_text, target_text = delete_event(data[i], event_id)
        # Append to dataset (not implemented here)

In [8]:
records = []

for rec in data:
    good_events = find_good_masking(rec)

    for eid, _ in good_events[:5]:
        input_text, target_text = add_event(rec, eid)
        records.append({
            "mask_type": "insertion/modification",
            "input_text": input_text,
            "target_text": target_text
        })

    for eid, _ in good_events[:3]:
        input_text, target_text = delete_event(rec, eid)
        records.append({
            "mask_type": "deletion",
            "input_text": input_text,
            "target_text": target_text
        })

# Check one record
print(records[0])

{'mask_type': 'insertion/modification', 'input_text': '<ADD> <START_OUTLINE> <BOE> Tom enjoys playing near a big pit in a sunny location. <EOE> <BOE> Tom loses his red ball and feels very sad about it. <EOE> <BOE> Tom asks his friend Sam for help, and they search for the ball without success. <EOE> <BOE> Tom suspects the ball fell into the pit, and he and Sam approach it despite their fear. <EOE> <BOE> Determined to find the ball, Tom and Sam decide to go into the dark pit. <EOE> <BOE> Inside the pit, they search but cannot find the ball and realize they are stuck. <EOE> <BOE> Tom and Sam call for help, but no one hears them, leaving them sad and scared. <EOE> <END_OUTLINE> <START_STORY> [MASK_START] Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. [MASK_END] One day, Tom lost his red ball. He was very sad. Tom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the

In [9]:
# Convert records to Hugging Face Dataset
from datasets import Dataset

dataset = Dataset.from_list(records)
print(dataset)

Dataset({
    features: ['mask_type', 'input_text', 'target_text'],
    num_rows: 22971
})


In [10]:
# Save to disk (Arrow format folder)
dataset.save_to_disk("masked_dataset")

# Reload later
from datasets import load_from_disk
dataset = load_from_disk("masked_dataset")

# Peek
print(dataset[0])

Saving the dataset (0/1 shards):   0%|          | 0/22971 [00:00<?, ? examples/s]

{'mask_type': 'insertion/modification', 'input_text': '<ADD> <START_OUTLINE> <BOE> Tom enjoys playing near a big pit in a sunny location. <EOE> <BOE> Tom loses his red ball and feels very sad about it. <EOE> <BOE> Tom asks his friend Sam for help, and they search for the ball without success. <EOE> <BOE> Tom suspects the ball fell into the pit, and he and Sam approach it despite their fear. <EOE> <BOE> Determined to find the ball, Tom and Sam decide to go into the dark pit. <EOE> <BOE> Inside the pit, they search but cannot find the ball and realize they are stuck. <EOE> <BOE> Tom and Sam call for help, but no one hears them, leaving them sad and scared. <EOE> <END_OUTLINE> <START_STORY> [MASK_START] Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. [MASK_END] One day, Tom lost his red ball. He was very sad. Tom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the

In [45]:
print(dataset)
print(dataset[0]["input_text"])
print(dataset[0]["target_text"])
# split train test
train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test["train"]
test_dataset = train_test["test"]
print("Train dataset size:", len(train_dataset))
print("Test dataset size:", len(test_dataset))
print(train_test)

Dataset({
    features: ['mask_type', 'input_text', 'target_text'],
    num_rows: 22971
})
<ADD> <START_OUTLINE> <BOE> Tom enjoys playing near a big pit in a sunny location. <EOE> <BOE> Tom loses his red ball and feels very sad about it. <EOE> <BOE> Tom asks his friend Sam for help, and they search for the ball without success. <EOE> <BOE> Tom suspects the ball fell into the pit, and he and Sam approach it despite their fear. <EOE> <BOE> Determined to find the ball, Tom and Sam decide to go into the dark pit. <EOE> <BOE> Inside the pit, they search but cannot find the ball and realize they are stuck. <EOE> <BOE> Tom and Sam call for help, but no one hears them, leaving them sad and scared. <EOE> <END_OUTLINE> <START_STORY> [MASK_START] Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. [MASK_END] One day, Tom lost his red ball. He was very sad. Tom asked his friend, Sam, to help him search for the ball. They looked high 

In [39]:
from collections import Counter
import random

# Check how many records of each mask_type
mask_type_counts = Counter(dataset['mask_type'])
print(mask_type_counts)

# Random sample check
for i in range(5):
    idx = random.randint(0, len(dataset)-1)
    print(f"Sample {i+1}:")
    print(f"Mask Type: {dataset[idx]['mask_type']}")
    print(f"Input Text: {dataset[idx]['input_text']}")
    print(f"Target Text: {dataset[idx]['target_text']}")
    print()


# Check whether deletion is working correctly
# The outline should not contain the deleted event summary
for i in range(5):
    idx = random.randint(0, len(dataset)-1)
    record = dataset[idx]
    if record['mask_type'] == 'deletion':
        # Extract event summaries from input_text
        outline_start = record['input_text'].index("<START_OUTLINE>") + len("<START_OUTLINE>")
        outline_end = record['input_text'].index("<END_OUTLINE>")
        outline_text = record['input_text'][outline_start:outline_end].strip()

        # Check that target_text (deleted event summary) is not in outline_text
        if record['target_text'] in outline_text:
            print(f"Error: Deleted event summary found in outline for sample index {idx}")
        else:
            print(f"Deletion check passed for sample index {idx}")

Counter({'insertion/modification': 14034, 'deletion': 8937})
Sample 1:
Mask Type: insertion/modification
Input Text: <ADD> <START_OUTLINE> <BOE> Cindy, an ordinary girl, enjoyed playing with her toys and singing songs. <EOE> <BOE> One day, she decided to explore and opened the door to her room, but it was too dark. <EOE> <BOE> Remembering her mom's permission, she opened the wardrobe door, which creaked. <EOE> <BOE> Behind the wardrobe was a doorway, and Cindy was delighted and excited to enter. <EOE> <BOE> Stepping through the doorway, she was amazed by brightly colored lights and many things to explore. <EOE> <BOE> Cindy spent the whole day discovering new places and having fun. <EOE> <BOE> When night came, she returned to her bedroom, tired but happy, where Mom was waiting. <EOE> <BOE> Mom expressed joy over Cindy's fun day and suggested they do it again tomorrow, thrilling Cindy. <EOE> <END_OUTLINE> <START_STORY> Cindy was an ordinary girl. She enjoyed playing with her toys and sin

In [12]:
# Each record:
#   id: int
#   story: str (full story; lines/sentences)
#   outline: {
#     title, characters, setting (optional),
#     events: {event_id: {"rev": int, "summary": str, "from_lines": [int,...]}},
#     sequence: [event_id, ...],
#     ending: {summary: str} (optional)
#   }

In [13]:
# ============================================
# REVISED DATA PREP SCHEMA (NO CODE)
# ============================================

# GOAL
# - Produce a DataFrame with exactly 3 columns:
#     mask_type   : "insertion" | "deletion" | "modification"
#     input_text  : single concatenated string (CONTROL + OUTLINE + STORY with one mask)
#     target_text : the span that was masked out (gold text)
# - Unified objective: predict the text between [MASK_START] ... [MASK_END].
# - No anchor-bounded masking (for now). One mask per sample.

# --------------------------------------------
# 0) SPECIAL TOKENS
# --------------------------------------------
# Control: <ADD>, <DELETE>
# Structure: <START_OUTLINE>, <END_OUTLINE>, <START_STORY>, <END_STORY>, <BOE>, <EOE>
# Mask: [MASK_START], [MASK_END]
# Ensure tokens are isolated by spaces in serialization.

# --------------------------------------------
# 1) INPUT JSON ASSUMPTIONS (for prep only)
# --------------------------------------------
# Each record:
#   id: int
#   story: str (full story; lines/sentences)
#   outline: {
#     title, characters, setting (optional),
#     events: {event_id: {"rev": int, "summary": str, "from_lines": [int,...]}},
#     sequence: [event_id, ...],
#     ending: {summary: str} (optional)
#   }
# NOTE: sequence/rev are used to CHOOSE spans and simulate mods, but are NOT serialized into input_text.

# --------------------------------------------
# 2) CANONICALIZATION
# --------------------------------------------
# - Split story into the same sentence/line scheme that produced "from_lines".
# - For each event_id in outline.sequence, confirm its sentences form a contiguous span.
# - Skip/flag non-contiguous or unmapped events.

# --------------------------------------------
# 3) MASK TYPES (NO ANCHOR-BOUNDED)
# --------------------------------------------
# (A) INSERTION
#   - Choose an existing event e_ins to "pretend-insert".
#   - Remove e_ins sentences from the story and place [MASK_START] [MASK_END] at that exact location.
#   - Keep the outline unchanged.
#   - CONTROL = <ADD>, mask_type = "insertion"
#   - target_text = original e_ins sentences.

# (B) DELETION
#   - Choose an event e_del.
#   - In the outline SERIALIZATION, omit e_del (do not print it).
#   - In the story, replace e_del sentences with [MASK_START] [MASK_END].
#   - CONTROL = <DELETE>, mask_type = "deletion"
#   - target_text = original e_del sentences (train model to learn concise bridging despite outline gap).

# (C) MODIFICATION
#   - Choose an event e_mod.
#   - Keep outline order; optionally tweak the printed summary text to simulate the modification
#     (e.g., mention a changed detail). You do NOT need to print "rev".
#   - In the story, replace e_mod sentences with [MASK_START] [MASK_END].
#   - CONTROL = <ADD>, mask_type = "modification"
#   - target_text = original e_mod sentences (model learns to rewrite per updated outline summary).

# --------------------------------------------
# 4) OUTLINE SERIALIZATION (INPUT SIDE)
# --------------------------------------------
# input_outline_block =
#   "<START_OUTLINE>\n" +
#   join([
#     f"<BOE> {events[eid].summary} <EOE>"
#     for eid in serialized_sequence
#   ], " ") +
#   " <END_OUTLINE>"
#
# where:
#   - serialized_sequence = original sequence
#   - for DELETION, drop e_del from serialized_sequence
#   - for MODIFICATION, keep eid but you may alter its summary text (not required)

# --------------------------------------------
# 5) STORY SERIALIZATION (INPUT SIDE)
# --------------------------------------------
# story_with_mask = original story text with exactly one event span replaced by:
#   "[MASK_START] [MASK_END]"
#
# Wrap:
#   input_story_block = f"<START_STORY> {story_with_mask} <END_STORY>"

# --------------------------------------------
# 6) CONTROL TOKEN PLACEMENT
# --------------------------------------------
# control_prefix = "<ADD>" for insertion/modification; "<DELETE>" for deletion.
# input_text = f"{control_prefix} {input_outline_block} {input_story_block}"

# --------------------------------------------
# 7) TARGET (OUTPUT)
# --------------------------------------------
# target_text = the exact gold text removed from the masked span (verbatim).

# --------------------------------------------
# 8) PER-STORY AUGMENTATION
# --------------------------------------------
# - Create K variants per story (e.g., 4–8):
#   ~3 insertion, ~1–2 modification, ~1–3 deletion.
# - Prefer middle events; down-weight first/last.
# - Enforce one mask per example; cap target length if needed.

# --------------------------------------------
# 9) FINAL DATAFRAME
# --------------------------------------------
# Columns (exactly):
#   mask_type   : str  # "insertion" | "deletion" | "modification"
#   input_text  : str  # CONTROL + OUTLINE + STORY(with [MASK_START][MASK_END])
#   target_text : str  # gold span
#
# (Optional during prep/debug; not required on disk)
#   story_id, control_tag, event_ids_in_mask, input_len, target_len

# --------------------------------------------
# 10) STORAGE
# --------------------------------------------
# Save as Parquet/Arrow (snappy or zstd). Keep a tiny JSONL sample for spot checks.

# --------------------------------------------
# 11) INFERENCE PIPELINE MIRROR
# --------------------------------------------
# - Build input the same way:
#   * choose CONTROL (<ADD> or <DELETE>),
#   * serialize the outline you intend (e.g., with a new event summary or with an event removed),
#   * insert a single [MASK_START] [MASK_END] at the intended location in the story context,
#   * feed input_text to the model and read generated span as the answer.
# - No target_text at inference, obviously.
# ============================================
