In [1]:
%load_ext autoreload
%autoreload 2

from meri.utils import setup_logging
import logging
from functools import lru_cache
from IPython.display import display_markdown

logger = logging.getLogger(__name__)

logging.basicConfig()

In [3]:
from datetime import datetime
import requests
import requests.adapters
from meri.extractor._processors import html_to_markdown
from haystack import Document

from meri.scraper import get_user_agent, try_setup_requests_cache

try_setup_requests_cache()

# Set request session
session = requests.Session()
retries = requests.adapters.Retry()
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries))
session.headers.update({
    "User-Agent": get_user_agent(),
})

search_term = "Climate Change"
search_url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srnamespace=0&format=json"
page_url = "https://en.wikipedia.org/?action=render"
search_response = session.get(search_url, params={
    "srsearch": search_term
})
data = search_response.json()
docs = []

def page_summary(page_ids: list[int]):
    page_url = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&redirects=1&exintro=1&explaintext=1"
    page_response = session.get(page_url, params={"pageids": "|".join(map(str, page_ids))})
    return page_response.json()

for result in data["query"]["search"]:    
    print(f"[-] Fetching article {result['title']!r}")
    if result["ns"] != 0:
        print(f"[!] Skipping non-article page {result['title']!r}")
        continue
    page_response = session.get(page_url, params={"curid": result["pageid"]})

    summary = html_to_markdown(result["snippet"])

    # Convert to markdown
    content = f"# {result['title']}\n\n" + html_to_markdown(page_response.text)
    docs.append(
        Document(
            content=content,
            meta={
                "title": result["title"],
                "url": page_response.url,
                "snippet": summary,
                "language": "en",
                "date_added": datetime.now().isoformat(),
            },
        )
    )

print(f"Collected {len(docs)} documents")

[-] Fetching article 'Climate change'
[-] Fetching article 'Climate change denial'
[-] Fetching article 'Climate change mitigation'
[-] Fetching article 'Paris Agreement'
[-] Fetching article 'United Nations Framework Convention on Climate Change'
[-] Fetching article 'Effects of climate change'
[-] Fetching article 'United Nations Climate Change Conference'
[-] Fetching article 'Climate change adaptation'
[-] Fetching article 'Climate change (disambiguation)'
[-] Fetching article 'Intergovernmental Panel on Climate Change'
Collected 10 documents


In [5]:
SUMMARY_PROMPT = """
You are a helpful AI assistant for Klikkikuri service that summarizes text.

Your task is to summarize the given text in the <article> -section. Aim for the following:
- Summary __MUST__ be as non-opinionated as possible, and close to the original text.
- Summary __MUST__ be in the same language as the text.
- Summary __MAY NOT__ contain any additional information, context or commentary in the summary that is not in the text.
- Summary __NEEDS__ to contain the entities and relations of the text.
- Summary __NEEDS__ to contain the most important points of the text.
- Summary __SHOULD__ distill the essential points of the text.
- Summary __SHOULD__ containt the most unique points of the text of the section.
- Summary __SHOULD__ be as short as feasible.
- Summary __MAY ONLY__ include basic semantic formatting in markdown like bold or emphasis, but __MAY NOT__ include formatting like links, images, tables, headings, etc.
- Approach the task as extractive task, but format it as a abstact summary.
- Do __NOT__ add *any* preceding sentences (like "This is a summary of ...", "The section ...") or trailing sentences.
- For sources or references, return `{{SKIP_TAG}}`.
- The output should begin immediately with the summarized content.
- If no sensible summary can be generated, return `{{SKIP_TAG}}`.

You are to summarize following wikipedia article titled {{article_title|escape}}.
Section of the article to summarize is {{section_title|escape}} – keep the summary relevant to it.
Subsectioned content inside of `<summary>` -tags are previously summarized sections for context – focus on providing additional information to them, but do not repeat content that is provided already on them.

Article to summarize follows in markdown format:
<article>
{{text}}
</article>

Now please summarize the text in the <article> -section. 
You MUST only produce the summary, no yapping, no explanations.
"""

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from haystack import Pipeline, component
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from meri.llm import get_generator
from meri.wp import BetterMarkdownChunker


def reduce_to_summary(md: str, lang="en"):
    """
    Reduce the given markdown text to a summary using the OpenAI LLM.

    Parse the markdown text into a tree structure, and then recursively summarize each node in the tree.
    """

    MAX_CONCURRENT_WORKERS: int = 3
    SKIP_TAG = "<skip>"
    doc_tree = BetterMarkdownChunker(md, language=lang).parse()

    prompt_builder = PromptBuilder(SUMMARY_PROMPT)
    llm = get_generator()

    p = Pipeline(max_runs_per_component=1)
    p.add_component("prompt_builder", prompt_builder)
    p.add_component("llm", llm)
    p.connect("prompt_builder", "llm")

    def make_node_text(node, titles):
        section_titles = titles + [node['title']]
        section_titles_md = "\n\n".join(f"#{'#' * i} {_title}" for i, _title in enumerate(section_titles))
        return f"{section_titles_md}\n\n{node['body']}", section_titles

    @lru_cache(maxsize=128)
    def summarize_node_sync(text, titles) -> str:
        results = p.run({
            "prompt_builder": {
                "text": text,
                "article_title": titles[0],
                "section_title": titles[-1],
                "SKIP_TAG": SKIP_TAG,
            },
        })
        if "llm" in results:
            return results["llm"]["replies"][0]
        raise RuntimeError("No summary generated")

    def generate_summary(node, titles, executor, futures_map):
        print(f" |{'-'*len(titles)} node: {node['title']}")
        new_titles = titles + [node["title"]]
        node_text, full_titles = make_node_text(node, titles)

        if node['children']:
            # Recurse first (sequentially), collect child summaries later
            for child in node['children']:
                generate_summary(child, new_titles, executor, futures_map)

            # After all children are scheduled, combine summaries
            subsection_summaries = []
            for child in node['children']:
                child_summary = child.get("summary")
                subsection_summaries.append(child_summary)

            print(f" >{'-'*len(titles)} {len(subsection_summaries)} summaries for section: {node['title']}")
            sections = [node_text]
            subsection_titles = (f"#{'#' * (len(new_titles) + 1)} {_title}" for _title in titles[1:])
            for title, summary in zip(subsection_titles, subsection_summaries):
                if summary.strip() == SKIP_TAG:
                    logger.debug(f"Skipping summary for {title} due to SKIP_TAG")
                    continue

                sections += [f"{title}\n\n<summary>{summary}</summary>"]
            node_text = "\n\n".join(sections)

        # Submit summarization task
        future = executor.submit(summarize_node_sync, node_text, tuple(new_titles))
        futures_map[future] = node

    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_WORKERS) as executor:
        futures_map = {}
        generate_summary(doc_tree, [], executor, futures_map)

        for future in as_completed(futures_map):
            node = futures_map[future]
            res = future.result()
            if res.strip() == SKIP_TAG:
                logger.debug(f"Skipping summary for {node['title']} due to SKIP_TAG")
                continue
            node['summary'] = res

    display(doc_tree)

    return doc_tree['summary']

display_markdown(reduce_to_summary(docs[0].content, lang="en"), raw=True)


 | node: Climate change
 |- node: Terminology
 |- node: Global temperature rise
 |-- node: Temperatures prior to present-day global warming
 |-- node: Warming since the Industrial Revolution
 |--- node: Differences by region
 >-- 1 summaries for section: Warming since the Industrial Revolution
 |-- node: Future global temperatures
 >- 3 summaries for section: Global temperature rise
 |- node: Causes of recent global temperature rise
 |-- node: Greenhouse gases
 |-- node: Land surface changes
 |-- node: Other factors
 |--- node: Aerosols and clouds
 |--- node: Solar and volcanic activity
 |--- node: Climate change feedbacks
 >-- 3 summaries for section: Other factors
 >- 3 summaries for section: Causes of recent global temperature rise
 |- node: Modelling
 |- node: Impacts
 |-- node: Environmental effects
 |-- node: Tipping points and long-term impacts
 |-- node: Nature and wildlife
 |-- node: Humans
 |--- node: Health and food
 |--- node: Livelihoods and inequality
 |--- node: Climate 

{'level': 1,
 'title': 'Climate change',
 'summary': "Climate change refers to the human-induced rise in global temperatures, primarily driven by fossil fuel burning, deforestation, and certain agricultural and industrial practices that release greenhouse gases. Earth's average surface air temperature has increased nearly 1.5 °C since the Industrial Revolution, with the Arctic experiencing the most significant warming. The impacts of climate change include expanding deserts, more frequent heat waves and wildfires, thawing permafrost, retreating glaciers, and rising sea levels. These changes threaten human health, food and water security, and can lead to increased flooding, disease, and economic loss. The World Health Organization identifies climate change as a major global health threat. Without action to limit warming, societies and ecosystems face severe risks. The 2015 Paris Agreement aims to keep global warming well under 2 °C, but current pledges may lead to a rise of about 2.8 °C

Climate change refers to the human-induced rise in global temperatures, primarily driven by fossil fuel burning, deforestation, and certain agricultural and industrial practices that release greenhouse gases. Earth's average surface air temperature has increased nearly 1.5 °C since the Industrial Revolution, with the Arctic experiencing the most significant warming. The impacts of climate change include expanding deserts, more frequent heat waves and wildfires, thawing permafrost, retreating glaciers, and rising sea levels. These changes threaten human health, food and water security, and can lead to increased flooding, disease, and economic loss. The World Health Organization identifies climate change as a major global health threat. Without action to limit warming, societies and ecosystems face severe risks. The 2015 Paris Agreement aims to keep global warming well under 2 °C, but current pledges may lead to a rise of about 2.8 °C by century's end. Achieving the 1.5 °C target requires halving emissions by 2030 and reaching net-zero emissions by 2050. Transitioning to renewable energy sources and enhancing carbon capture methods are essential for mitigating climate change.