# Notebook to test story JSON outliner

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import torch
from gensim.models import Word2Vec
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
import json
import re
import numpy as np
import pandas as pd
import random
import hashlib
import os

# Make sure the required NLTK data packages are available.
# The data directory is built relative to the current working directory.
nltk_data_dir = os.path.join(os.getcwd(), "venv", "nltk_data")

# Create directory if it doesn't exist
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory with NLTK's resource search path. download_dir only
# controls where files are written; without this, the tokenizers find the data
# only if this directory already happens to be one of NLTK's default locations.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

nltk.download("punkt", download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
nltk.download("stopwords", download_dir=nltk_data_dir)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to d:\Keven(Work)\dsa4213\final
[nltk_data]     project\venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     d:\Keven(Work)\dsa4213\final project\venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     d:\Keven(Work)\dsa4213\final project\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Config
# ==============================
# JSONL file read line by line below; each line is parsed as one JSON record
DATA_FILE = "stories_with_outlines_first3000.jsonl"
# Local paths handed to from_pretrained() for the event summariser, the ending
# summariser and the QA model respectively
EVENT_MODEL = "./Event-summariser-LoRA-v1"
ENDING_MODEL = "./Ending-summariser-LoRA-v2"
QA_MODEL = "./QA-LoRA-v2"
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Prepare tokenizers and models
# Each model is loaded as a seq2seq LM from its configured path and moved to DEVICE;
# the matching tokenizer is loaded from the same path.
print(f"Loading event summariser model: {EVENT_MODEL}")
event_model = AutoModelForSeq2SeqLM.from_pretrained(EVENT_MODEL).to(DEVICE)
event_tokenizer = AutoTokenizer.from_pretrained(EVENT_MODEL)
print(f"Loading ending summariser model: {ENDING_MODEL}")
ending_model = AutoModelForSeq2SeqLM.from_pretrained(ENDING_MODEL).to(DEVICE)
ending_tokenizer = AutoTokenizer.from_pretrained(ENDING_MODEL)
# The QA model is shared by the title, character and setting helpers defined later
print(f"Loading QA model: {QA_MODEL}")
qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL).to(DEVICE)
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)

Loading event summariser model: ./Event-summariser-LoRA-v1
Loading ending summariser model: ./Ending-summariser-LoRA-v2
Loading QA model: ./QA-LoRA-v2


# Data Preview
(Reformatted from the actual JSON outline so that it can be compared against the predicted outputs)

In [4]:
print(f"Loading dataset: {DATA_FILE}")
# Read the JSONL file: one JSON object per line
mockdata = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file);
        # json.loads would raise JSONDecodeError on them
        if not line.strip():
            continue
        each_line = json.loads(line)
        # extract story text
        story = each_line['story']
        # extract json outline
        outline = each_line['outline']
        # append to dataset
        mockdata.append({"story": story, "outline": outline})

mockdata[:3] # preview first 3 entries

Loading dataset: stories_with_outlines_first3000.jsonl


[{'story': 'Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear them. They were sad and scared, and they never got out of the pit.',
  'outline': {'title': 'Tom and the Pit',
   'characters': ['Tom', 'Sam'],
   'setting': ['A warm and sunny place', 'A big pit'],
   'events': {'e1_6a': {'rev': 1,
     'sum

In [4]:
print(f"Loading dataset: {DATA_FILE}")
# Read the JSONL file (one JSON object per line) and flatten each outline into
# the fields that are compared against the model predictions later on
dataset = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file);
        # json.loads would raise JSONDecodeError on them
        if not line.strip():
            continue
        each_line = json.loads(line)
        # extract story text
        story = each_line['story']
        # extract and reformat outline details: events + title
        outline = each_line['outline']
        title = outline.get('title', '')
        # 'events' is iterated as a mapping, so the fallback must be a dict:
        # a list default would fail on .values() when the key is missing
        events = outline.get('events', {})
        # concatenate the event summaries into one space-separated string
        full_summary = ""
        for value in events.values():
            summary = value.get('summary')
            full_summary += f"{summary} "
        # extract ending & settings & characters
        # tolerate a missing 'ending' entry the same way the other fields do
        ending = outline.get('ending', {}).get('summary')
        char = outline.get('characters')
        setting = outline.get('setting')

        # append to dataset
        dataset.append({"story": story, "events": full_summary,
                         "title": title, "settings": setting, "characters": char, "ending": ending})


dataset[:3] # preview first 3 entries

Loading dataset: stories_with_outlines_first3000.jsonl


[{'story': 'Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear them. They were sad and scared, and they never got out of the pit.',
  'events': 'Tom enjoys playing near a big pit in a sunny location. Tom loses his red ball and feels very sad about it. Tom asks his friend Sam for help, and they search for

# Functions

In [5]:
# function to count sentences
def count_sentences(text):
    """Return how many sentences NLTK's sentence tokenizer finds in `text`."""
    return len(sent_tokenize(text))

# helper: average word vectors for a sentence
def sentence_vector(sentence, model):
    """Average the word vectors of the in-vocabulary tokens of `sentence`.

    The sentence is lower-cased and word-tokenized; tokens that `model.wv`
    does not contain are ignored. When no token is known, a zero vector of
    length `model.vector_size` is returned instead.
    """
    vocab = model.wv
    known_tokens = []
    for token in word_tokenize(sentence.lower()):
        if token in vocab:
            known_tokens.append(token)
    if known_tokens:
        return np.mean(vocab[known_tokens], axis=0)
    return np.zeros(model.vector_size)

# helper: cosine similarity
def cosine_sim(vec1, vec2):
    """Cosine similarity of two vectors; 0 when either vector has zero norm."""
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 != 0 and norm2 != 0:
        return np.dot(vec1, vec2) / (norm1 * norm2)
    # a zero-length vector has no direction, so similarity is defined as 0 here
    return 0

# main function: semantic chunking
def split_story_by_similarity(story: str, model: Word2Vec, threshold: float = 0.7) -> tuple[list[str], list[str]]:
    """Group consecutive sentences of `story` into chunks by embedding similarity.

    Each sentence is embedded with `sentence_vector`; whenever the cosine
    similarity between a sentence and the one before it drops below
    `threshold`, a new chunk is started.

    Args:
        story: Full story text.
        model: Word2Vec model used to embed the sentences.
        threshold: Similarity below which a new chunk begins.

    Returns:
        A `(chunks, from_lines)` pair of equal-length lists. `chunks[i]` is the
        chunk text (sentences joined by a space) and `from_lines[i]` is a
        ", "-separated string of the 1-based sentence numbers in that chunk.
        The pair is returned for every input, so callers can always unpack it:
        a story with no sentences gives `([], [])` and a single-sentence story
        gives `([story], ["1"])`.
    """
    sentences = sent_tokenize(story)
    if not sentences:
        return [], []
    if len(sentences) == 1:
        # previously a bare list was returned here, which broke tuple unpacking
        return [story], ["1"]

    chunks = []
    from_lines = []
    current_chunk = [sentences[0]]
    current_lines = [1]  # sentence numbers are 1-based
    prev_vec = sentence_vector(sentences[0], model)
    for line_no, sentence in enumerate(sentences[1:], start=2):
        curr_vec = sentence_vector(sentence, model)
        if cosine_sim(prev_vec, curr_vec) < threshold:
            # new context detected -> close the current chunk and start a new one
            chunks.append(" ".join(current_chunk))
            from_lines.append(", ".join(str(x) for x in current_lines))
            current_chunk = [sentence]
            current_lines = [line_no]
        else:
            current_chunk.append(sentence)
            current_lines.append(line_no)
        # similarity is always measured against the immediately preceding sentence
        prev_vec = curr_vec
    # flush the last chunk (it always holds at least one sentence)
    chunks.append(" ".join(current_chunk))
    from_lines.append(", ".join(str(x) for x in current_lines))
    return chunks, from_lines


In [11]:
# helper: generate short event ID
def short_event_id(summary: str) -> str:
    """Build a short, deterministic event ID from the SHA-1 of the summary text.

    The ID has the form ``e<x>_<yy>``, where ``x`` is the first hex digit of
    the digest and ``yy`` the following two.
    """
    digest = hashlib.sha1(summary.encode("utf-8")).hexdigest()
    head, tail = digest[0], digest[1:3]
    return "e" + head + "_" + tail

# helper: convert str into list of str
def text_to_list(text: str):
    """Turn a comma-separated string into a de-duplicated list of title-cased items.

    Each item has its whitespace collapsed to single spaces, is stripped and
    title-cased; empty items are dropped. Duplicates are removed while keeping
    the order of first appearance, so the result is the same on every run.

    Note: ``str.title()`` also capitalises the letter after an apostrophe
    (``"tom's"`` becomes ``"Tom'S"``).
    """
    cleaned = []
    for raw_item in text.split(','):
        # collapse runs of whitespace, then trim the ends
        item = re.sub(r'\s+', ' ', raw_item).strip()
        if item:
            cleaned.append(item.title())
    # dict.fromkeys keeps first-seen order; set() gave an arbitrary order that
    # could change between runs
    return list(dict.fromkeys(cleaned))

# helper: generate event summary
def summarise_this_event(event_model, event_tokenizer, chunk: str, device) -> str:
    """Summarise one story chunk with the event model.

    The chunk is wrapped in a "Summarise this text" prompt, tokenized, moved to
    `device`, and the first generated sequence (at most 128 new tokens) is
    decoded with special tokens removed.
    """
    prompt = f"Summarise this text:\n{chunk}\n"
    encoded = event_tokenizer(prompt, return_tensors="pt").to(device)
    generated = event_model.generate(**encoded, max_new_tokens=128)
    return event_tokenizer.decode(generated[0], skip_special_tokens=True)

# helper: generate story title
def generate_title(qa_model, qa_tokenizer, story: str, device) -> str:
    """Ask the QA model for a title for `story`.

    Returns the first generated sequence (at most 64 new tokens), decoded with
    special tokens removed.
    """
    question = f"Question: What is a good title for this story? \nStory: {story}"
    encoded = qa_tokenizer(question, return_tensors="pt").to(device)
    generated = qa_model.generate(**encoded, max_new_tokens=64)
    return qa_tokenizer.decode(generated[0], skip_special_tokens=True)

# helper: generate story characters
def generate_char(qa_model, qa_tokenizer, story: str, device) -> list[str]:
    """Ask the QA model who the characters in `story` are.

    The first generated sequence (at most 64 new tokens) is decoded with
    special tokens removed and passed through `text_to_list`.
    """
    question = f"Question: Who are the characters in this story? \nStory: {story}"
    encoded = qa_tokenizer(question, return_tensors="pt").to(device)
    token_ids = qa_model.generate(**encoded, max_new_tokens=64)[0]
    answer = qa_tokenizer.decode(token_ids, skip_special_tokens=True)
    return text_to_list(answer)

# helper: generate story settings
def generate_setting(qa_model, qa_tokenizer, story: str, device) -> list[str]:
    """Ask the QA model for all settings that appear in `story`.

    The first generated sequence (at most 64 new tokens) is decoded with
    special tokens removed and passed through `text_to_list`.
    """
    question = f"Question: What are all the settings in this story? \nStory: {story}"
    encoded = qa_tokenizer(question, return_tensors="pt").to(device)
    token_ids = qa_model.generate(**encoded, max_new_tokens=64)[0]
    answer = qa_tokenizer.decode(token_ids, skip_special_tokens=True)
    return text_to_list(answer)

# helper: generate story ending
def generate_ending(ending_model, ending_tokenizer, story: str, device) -> str:
    """Extract the story ending with the ending model.

    Returns the first generated sequence (at most 128 new tokens), decoded
    with special tokens removed.
    """
    instruction = f"Extract the ending for this text: \n{story}"
    encoded = ending_tokenizer(instruction, return_tensors="pt").to(device)
    generated = ending_model.generate(**encoded, max_new_tokens=128)
    return ending_tokenizer.decode(generated[0], skip_special_tokens=True)

# Testing Actual JSON outliner

In [12]:
# Test run everything except events(in the next cell)
# Draw the index from the records that were actually loaded: a hard-coded upper
# bound of 2999 raises IndexError as soon as the dataset has fewer than 3000 rows
INDEX = random.randrange(len(dataset))
story = dataset[INDEX]["story"]
event = dataset[INDEX]["events"]
title = dataset[INDEX]["title"]
characters = dataset[INDEX]["characters"]
settings = dataset[INDEX]["settings"]
ending = dataset[INDEX]["ending"]
dashline = "-" * 80
print(dashline)

# Compare original and predicted title
print(f"Original Story Title: {title}")
pred_title = generate_title(qa_model, qa_tokenizer, story, DEVICE)
print(f"Generated Model title: {pred_title}\n")
print(dashline)

# Compare original and predicted characters
print(f"Original Characters: {characters}")
pred_char = generate_char(qa_model, qa_tokenizer, story, DEVICE)
print(f"Generated Characters: {pred_char}\n")
print(dashline)

# Compare original and predicted settings
print(f"Original Settings: {settings}")
pred_settings = generate_setting(qa_model, qa_tokenizer, story, DEVICE)
print(f"Generated Settings: {pred_settings}\n")
print(dashline)

# Compare original and predicted ending
print(f"Original Ending: {ending}")
pred_ending = generate_ending(ending_model, ending_tokenizer, story, DEVICE)
print(f"Generated Ending: {pred_ending}\n")
print(dashline)



--------------------------------------------------------------------------------
Original Story Title: Blink and Blue
Generated Model title: A Beautiful oasis

--------------------------------------------------------------------------------
Original Characters: ['Blink', 'Blue']
Generated Characters: ['Blink', 'Blue']

--------------------------------------------------------------------------------
Original Settings: ['a large, hot desert', 'an oasis']
Generated Settings: ['Oasis', 'Hot Desert']

--------------------------------------------------------------------------------
Original Ending: Blink and Blue's friendship blossomed as they enjoyed their time together in the oasis.
Generated Ending: Blue and Blink have been friends since the beginning of their friendship.

--------------------------------------------------------------------------------


In [13]:
# Test run event summary
print(f"Number of outlined events based on actual JSON summary: {count_sentences(event)}\n")
# Train a small Word2Vec model on this story's own lower-cased, tokenized sentences
sentences = [word_tokenize(s.lower()) for s in sent_tokenize(story)]
w2v_model = Word2Vec(sentences, vector_size=150, window=5, min_count=1, workers=2)


# MAYBE THIS COULD BE PART OF ABLATION STUDY
# THRESHOLD TUNING: if too many chunks, lower threshold; if too few, increase
THRESHOLD = 0.2

# split story into semantically coherent chunks
chunks, from_lines = split_story_by_similarity(story, w2v_model, threshold=THRESHOLD)

# Show every chunk with its source sentence numbers and the generated summary
for chunk_no, (chunk_text, chunk_lines) in enumerate(zip(chunks, from_lines), 1):
    print(f"--- Chunk {chunk_no} ---")
    print(f"From line(s): {chunk_lines}")
    print(chunk_text)
    chunk_summary = summarise_this_event(event_model, event_tokenizer, chunk_text, DEVICE)
    print(f"chunk summary: {chunk_summary}")
    

Number of outlined events based on actual JSON summary: 6

--- Chunk 1 ---
From line(s): 1, 2, 3, 4, 5, 6, 7
Once upon a time, in a large, hot desert, there was a beautiful oasis. In the oasis, lived a tiny bug named Blink. Blink was a happy bug who liked to play with his friends. The oasis was full of water, trees, and fun. One day, while Blink was playing, he met a big bird named Blue. Blue was very thirsty and asked Blink, "Where can I find water to drink?" Blink knew where the water was and wanted to help.
chunk summary: In a large desert, Blink is a happy bug who loves playing with his friends.
--- Chunk 2 ---
From line(s): 8, 9, 10, 11, 12, 13
He said, "Follow me, Blue, I will show you the water." So, Blue followed Blink to the water in the oasis. Blue was very happy to see the water and drank a lot. Blue thanked Blink and said, "You are a good friend, Blink." From that day on, Blink and Blue became best friends. They played together at the oasis every day, and everyone knew that

# JSON output

In [14]:
# Function that returns a structured json schema
def generate_json_outline(story,
                          qa_model=qa_model, qa_tokenizer=qa_tokenizer,
                          ending_model=ending_model, ending_tokenizer=ending_tokenizer,
                          event_model=event_model, event_tokenizer=event_tokenizer,
                          chunk_similarity_threshold=0.2, device=DEVICE):
    """Build a structured outline (dict) for a story.

    The story is split into chunks of similar consecutive sentences, each
    chunk is summarised into one event, and the QA / ending models fill in the
    title, characters, settings and ending.

    Args:
        story: Full story text.
        qa_model, qa_tokenizer: Model/tokenizer used for title, characters and settings.
        ending_model, ending_tokenizer: Model/tokenizer used for the ending.
        event_model, event_tokenizer: Model/tokenizer used for event summaries.
        chunk_similarity_threshold: Threshold passed to `split_story_by_similarity`.
        device: Device the tokenized inputs are moved to.

    Returns:
        A dict with the keys "title", "characters", "settings", "events",
        "sequence" and "ending". "events" maps an event id to
        {"rev", "summary", "from_lines"}; "sequence" lists the event ids in
        story order.

    Note:
        The model/tokenizer defaults are bound when this function is defined,
        so re-run this definition after reloading any of the models.
    """
    # JSON schema to be returned
    summary_json = {
        "title": None,
        "characters": None,
        "settings": None,
        "events": {},
        "sequence": [],
        "ending": None
    }
    # split story into semantically coherent chunks first
    sentences = [word_tokenize(s.lower()) for s in sent_tokenize(story)]
    if len(sentences) > 1:
        w2v_model = Word2Vec(sentences, vector_size=150, window=5, min_count=1, workers=2)
        event_chunks, from_lines = split_story_by_similarity(story, w2v_model, chunk_similarity_threshold)
    elif sentences:
        # a single sentence is its own chunk; there is nothing to compare it with
        event_chunks, from_lines = [story], ["1"]
    else:
        # no sentences at all: no events, and no vocabulary to train Word2Vec on
        event_chunks, from_lines = [], []
    # populate events and sequence
    for chunk, chunk_lines in zip(event_chunks, from_lines):
        # generate json entry for each event summary
        curr_chunk_summary = summarise_this_event(event_model, event_tokenizer, chunk, device)
        event_id = short_event_id(curr_chunk_summary)
        # Event ids carry only three hex digits, so two events can collide
        # (always the case for identical summaries). Re-hash with a numeric
        # salt until the id is free instead of overwriting the earlier event.
        salt = 1
        while event_id in summary_json["events"]:
            event_id = short_event_id(f"{curr_chunk_summary}#{salt}")
            salt += 1
        summary_json["events"][event_id] = {
            "rev": 1, # revision number will always be 1 for generated outlines
            "summary": curr_chunk_summary,
            # convert from_lines to list of integers
            "from_lines": list(map(int, chunk_lines.split(", ")))
        }
        # add event id to sequence
        summary_json["sequence"].append(event_id)
    # populate title, characters, settings & ending
    summary_json["title"] = generate_title(qa_model, qa_tokenizer, story, device)
    summary_json["characters"] = generate_char(qa_model, qa_tokenizer, story, device)
    summary_json["settings"] = generate_setting(qa_model, qa_tokenizer, story, device)
    summary_json["ending"] = generate_ending(ending_model, ending_tokenizer, story, device)

    return summary_json

In [16]:
# Test JSON output generation
# Draw the index from the records that were actually loaded rather than assuming
# exactly 3000 rows (a hard-coded 2999 raises IndexError on a smaller dataset)
INDEX = random.randrange(len(dataset))
story = dataset[INDEX]["story"]
THRESHOLD = 0.15 # edit this to obtain diff variation of event outlines
json_outline = generate_json_outline(story, chunk_similarity_threshold=THRESHOLD)
print(json.dumps(json_outline, indent=3))

{
   "title": "Tim and Sue's Peach",
   "characters": [
      "Sue",
      "Tim"
   ],
   "settings": [
      "The Store"
   ],
   "events": {
      "ed_37": {
         "rev": 1,
         "summary": "Tim is tired and goes to the store to buy a peach, and his friend Sue tells him to stay away.",
         "from_lines": [
            1,
            2,
            3,
            4
         ]
      },
      "e5_bd": {
         "rev": 1,
         "summary": "Tim and Sue walked to her bike, and they noticed the bike was broken.",
         "from_lines": [
            5,
            6,
            7,
            8,
            9
         ]
      },
      "e6_4c": {
         "rev": 1,
         "summary": "Sue is happy and unexpectedly happens.",
         "from_lines": [
            10,
            11
         ]
      },
      "e7_ea": {
         "rev": 1,
         "summary": "Tim picks up the peach and begins to talk, expressing his gratitude to Sue.",
         "from_lines": [
            12,
  