In [None]:
%load_ext autoreload
%autoreload 2

from meri.utils import setup_logging
import logging
from functools import lru_cache
from IPython.display import display_markdown
from meri.scraper import try_setup_requests_cache

# Notebook-level logger; functions defined in later cells emit their debug messages through it.
logger = logging.getLogger(__name__)
# Uncomment to see DEBUG-level log output while developing.
#logging.basicConfig(level=logging.DEBUG)

# NOTE(review): presumably enables caching of outgoing HTTP requests when available — confirm in meri.scraper.
try_setup_requests_cache()

In [None]:
import mwclient.page
from meri.wp import page_to_document
from meri.wp import mediawiki_html_to_markdown
import mwclient

# Connect to English Wikipedia and look up the example article used throughout this notebook.
site = mwclient.Site("en.wikipedia.org")
page = site.pages["Pet store"]

# Convert the page in two steps: page -> document, then document -> markdown document.
# NOTE(review): judging by the helper names, html_doc holds rendered MediaWiki HTML — confirm in meri.wp.
html_doc = page_to_document(page)
md_doc = mediawiki_html_to_markdown(html_doc)

# Record which page revision the markdown was produced from.
md_doc.meta['revision'] = page.revision

display(md_doc)


In [None]:
from meri.pipelines.summation import LmmSummationPipeline

# Show the metadata that accompanies the markdown document before it is processed.
display(md_doc.meta)

# Instantiate the project's summation pipeline and call it on the markdown document.
# NOTE(review): `farts` is a placeholder name; rename it once it is confirmed no other cell reads it.
farts = LmmSummationPipeline()(md_doc)
display(farts)

In [None]:
# Generate a structured tree of section summaries. The summaries are generated with an LLM.
# TODO: Find alternatives to LLM summarization

# Prompt template handed to PromptBuilder in reduce_to_summary().
# Template variables filled in per section: `article_title`, `section_title`, `text` and `SKIP_TAG`.
# `{# ... #}` spans are template comments (currently disabled instructions), not prompt text.
SUMMARY_PROMPT = """
You are a helpful AI assistant for Klikkikuri service for generating a comprehensive summary of a Wikipedia article.

Your task is to summarize the given text in the <article> -section. Aim for the following:
- Summary __MUST__ be as non-opinionated as possible, and close to the original text.
- Summary __MUST__ be in the same language as the article.
- Summary __MAY NOT__ contain any additional information, context or commentary in the summary that is not in the text.
- Summary __NEEDS__ to contain the entities and relations of the text.
- Summary __NEEDS__ to contain the most important points of the text.
- Summary __SHOULD__ distill the essential points of the text.
- Summary __SHOULD__ be as short as feasible.
{# - Summary __SHOULD__ contain the most unique points of the text of the section. #}
- Summary __MAY ONLY__ include basic semantic formatting in markdown like bold or emphasis, but __MAY NOT__ include formatting like links, images, tables, headings, etc.
- Approach the task as extractive task, but format it as a abstract summary.
- Do __NOT__ add *any* preceding sentences (like "This is a summary of ...", "The section ...") or trailing sentences.
- The output should begin immediately with the summarized content.
- For sections listing sources or references, return `{{SKIP_TAG}}`.
- If no sensible summary can be generated, return `{{SKIP_TAG}}`.

You are to summarize following wikipedia article titled {{article_title|escape}}.
Section of the article to summarize is {{section_title|escape}} – keep the summary relevant to it.
Subsectioned content inside of `<summary>` -tags are previously summarized sections for context – focus on providing additional information to them, but do not repeat content that is provided already on them.

{# You MUST only produce the summary, no yapping, no explanations. #}

Article to summarize in markdown format:
<article>
{{text|indent}}
</article>
"""

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple
from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from meri.llm import get_generator
from meri.wp import MarkdownChunker


def reduce_to_summary(md: str, lang="en"):
    """
    Reduce the given markdown text to a tree of per-section LLM summaries.

    The markdown is parsed into a section tree with ``MarkdownChunker``. Sections are then
    summarized bottom-up, one tree depth at a time: every section of the deepest level is
    summarized first (concurrently), so that by the time a parent section is summarized the
    summaries of its direct children already exist and can be appended to the parent's prompt
    inside ``<summary>`` tags.

    Args:
        md: Markdown text of the article.
        lang: Language passed on to ``MarkdownChunker``.

    Returns:
        The parsed tree. Each node for which the LLM produced a summary gets a ``summary``
        key; nodes for which the LLM answered with the skip tag are left without one.

    Raises:
        RuntimeError: If the pipeline returns no LLM reply for a section.
    """

    MAX_CONCURRENT_WORKERS: int = 3
    SKIP_TAG = "<skip>"
    doc_tree = MarkdownChunker(md, language=lang).parse()

    prompt_builder = PromptBuilder(SUMMARY_PROMPT)
    llm = get_generator()

    p = Pipeline(max_runs_per_component=1)
    p.add_component("prompt_builder", prompt_builder)
    p.add_component("llm", llm)
    p.connect("prompt_builder", "llm")

    def make_node_text(node, titles):
        """Return (heading trail + node body, title path including the node itself)."""
        section_titles = titles + [node['title']]
        section_titles_md = "\n\n".join(f"#{'#' * i} {_title}" for i, _title in enumerate(section_titles))
        return f"{section_titles_md}\n\n{node['body']}", section_titles

    # The cache lives only for this call of reduce_to_summary(); it merely avoids
    # re-querying the LLM for byte-identical (text, titles) pairs within one article.
    @lru_cache(maxsize=128)
    def summarize_node_sync(text, titles) -> str:
        """Run the prompt -> LLM pipeline for one section and return the first reply."""
        results = p.run({
            "prompt_builder": {
                "text": text,
                "article_title": titles[0],
                "section_title": titles[-1],
                "SKIP_TAG": SKIP_TAG,
            },
        })
        replies = results.get("llm", {}).get("replies")
        if not replies:
            raise RuntimeError("No summary generated")
        return replies[0]

    def collect_by_depth(node, titles, levels):
        """Group every node, paired with its ancestor titles, by its depth in the tree."""
        print(f"{' '*len(titles)}|- node: {node['title']}")
        depth = len(titles)
        while len(levels) <= depth:
            levels.append([])
        levels[depth].append((node, titles))

        child_titles = titles + [node['title']]
        for child in node['children']:
            collect_by_depth(child, child_titles, levels)

    def build_prompt_text(node, titles):
        """Build the text to summarize: the node's own text plus its children's summaries."""
        node_text, new_titles = make_node_text(node, titles)
        if not node['children']:
            return node_text, new_titles

        sections = [node_text]
        # make_node_text() renders the node itself with len(new_titles) hashes,
        # so its direct children sit exactly one heading level below that.
        child_heading = "#" * (len(new_titles) + 1)
        for child in node['children']:
            summary = child.get("summary")
            # A child has no summary when the LLM answered with SKIP_TAG for it.
            if summary is None or summary.strip() == SKIP_TAG:
                logger.debug(f"Skipping summary for {child['title']} due to SKIP_TAG")
                continue
            sections.append(f"{child_heading} {child['title']}\n\n<summary>{summary}</summary>")

        print(f" >{'-'*len(titles)} {len(sections) - 1} summaries for section: {node['title']}")
        return "\n\n".join(sections), new_titles

    levels = []
    collect_by_depth(doc_tree, [], levels)

    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_WORKERS) as executor:
        # Deepest level first: a level is only submitted once all summaries of the level
        # below it have been stored, so parents always see their children's summaries.
        for level_nodes in reversed(levels):
            futures_map = {}
            for node, titles in level_nodes:
                node_text, new_titles = build_prompt_text(node, titles)
                # Titles are passed as a tuple so the arguments are hashable for lru_cache.
                future = executor.submit(summarize_node_sync, node_text, tuple(new_titles))
                futures_map[future] = node

            for future in as_completed(futures_map):
                node = futures_map[future]
                res = future.result()
                if res.strip() == SKIP_TAG:
                    logger.debug(f"Skipping summary for {node['title']} due to SKIP_TAG")
                    continue
                node['summary'] = res

    return doc_tree

# Build the summary tree for the example article; this issues one LLM request per section.
summarized_docs = reduce_to_summary(md_doc.content, lang="en")

#display_markdown(reduce_to_summary(docs[0].content, lang="en"), raw=True)


In [None]:
# Convert the parsed tree into documents, where each Document contains the summaries of its parent sections followed by the section's own body.

from copy import deepcopy
from typing import Iterator, List, Optional

from haystack import Document

def _get_branch(tree, branches_path: List[int]) -> Iterator:
    node = tree
    yield node
    for branch in branches_path: 
        node = node['children'][branch]
        yield node


def tree_to_doc(tree, base_doc: Document, branches_path: Optional[List[int]] = None) -> List[Document]:
    """
    Convert the tree structure into a flat list of documents, one per section.

    The content of each document is the heading and summary of every ancestor on the path
    from the root, followed by the heading and full body of the section itself. The section
    title is appended to the copied document's ``meta['title']``, and children are built
    from their parent's document, so titles accumulate along the path.

    Args:
        tree: Root node of the summarized section tree.
        base_doc: Document that is deep-copied for the section; must have ``meta['title']`` set.
        branches_path: Child indexes leading from the root to the section to convert.
            Defaults to the root itself.

    Returns:
        Documents for the addressed section and all of its descendants, parents first.
    """
    if branches_path is None:
        branches_path = []

    docs = []

    *ancestors, leaf = _get_branch(tree, branches_path)

    # Collect ancestor summaries. A node has no summary when summarization skipped it,
    # in which case only its heading is kept.
    content = []
    for ancestor in ancestors:
        heading = f"{'#' * ancestor['level']} {ancestor['title']}"
        summary = ancestor.get('summary')
        content.append(f"{heading}\n\n{summary}" if summary else heading)

    # The addressed section itself contributes its full body.
    content.append(f"{'#' * leaf['level']} {leaf['title']}\n\n{leaf['body']}")

    doc = deepcopy(base_doc)
    doc.content = "\n\n".join(content).strip()
    doc.meta['title'] += f" > {leaf['title']}"
    # The deep copy carries base_doc's id. _create_id() returns the content hash rather than
    # storing it, so assign it explicitly; otherwise every section document shares one id.
    new_id = doc._create_id()
    if new_id is not None:
        doc.id = new_id
    if doc.content:
        docs.append(doc)

    # Recurse into the children, using this section's document as their base.
    for child_index in range(len(leaf['children'])):
        docs += tree_to_doc(tree, doc, branches_path + [child_index])
    return docs

# Work on a copy so that md_doc's metadata is not modified by the title change below.
base_doc = deepcopy(md_doc)

# Root of the title trail; tree_to_doc() appends " > <section title>" for each level.
base_doc.meta['title'] = "Wikipedia"

docs = tree_to_doc(summarized_docs, base_doc)
print("Extracted document len:", len(docs))
