In [9]:
%load_ext autoreload
%autoreload 2

from meri.utils import setup_logging
import logging
import mwparserfromhell
from IPython.display import display_markdown

# Configure the root logger at DEBUG level for this notebook session.
# NOTE(review): `setup_logging` is imported above but never called in the
# visible cells — confirm whether it was meant to replace this call.
logging.basicConfig(level=logging.DEBUG)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
#import wikipedia
#print(wikipedia.summary("Mathematics"))
#wikipedia.search("Mathematics")
#html = wikipedia.page("Mathematics").url

# # Convert to markdown
# from markdownify import markdownify as md
# from IPython.display import display_markdown
# md_ = md(html)
# display_markdown(md_, raw=True)

In [4]:
import wikipediaapi
from meri.scraper import get_user_agent

# Wikipedia client for the English wiki that identifies itself with the
# project's user agent and requests extracts in HTML form.
wp = wikipediaapi.Wikipedia(
    get_user_agent(),
    extract_format=wikipediaapi.ExtractFormat.HTML,
    language='en',
)
page = wp.page("Climate_change")

# Show the summary of the page as returned by the client.
display(page.summary)

'<p class="mw-empty-elt">\n\n</p><p class="mw-empty-elt">\n\n</p>\n\n\n<p>Present-day <b>climate change</b> includes both <b>global warming</b>—the ongoing increase in global average temperature—and its wider effects on Earth\'s climate. Climate change in a broader sense also includes previous long-term changes to Earth\'s climate. The current rise in global temperatures is driven by human activities, especially fossil fuel burning since the Industrial Revolution. Fossil fuel use, deforestation, and some agricultural and industrial practices release greenhouse gases. These gases absorb some of the heat that the Earth radiates after it warms from sunlight, warming the lower atmosphere. Carbon dioxide, the primary greenhouse gas driving global warming, has grown by about 50% and is at levels not seen for millions of years.\n</p><p>Climate change has an increasingly large impact on the environment. Deserts are expanding, while heat waves and wildfires are becoming more common. Amplified w

In [None]:
from datetime import datetime
import requests
import requests.adapters
from meri.extractor._processors import html_to_markdown
from haystack import Document

# Optional HTTP cache: if requests_cache is installed, install it under the
# name "wikipedia_cache"; otherwise continue without caching.
try:
    import requests_cache
    requests_cache.install_cache("wikipedia_cache")
except ImportError:
    print("No cache available")

# Shared session used for every Wikipedia request in this notebook.
session = requests.Session()
# Retry transient failures (rate limiting and server errors) with exponential
# backoff. A bare Retry() has no backoff and no status_forcelist, so it never
# retries on 429/5xx responses.
retries = requests.adapters.Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=(429, 500, 502, 503, 504),
)
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries))

search_term = "Climate Change"
search_url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srnamespace=0&format=json"
page_url = "https://en.wikipedia.org/?action=render"

# An explicit timeout keeps a stalled connection from hanging the kernel.
search_response = session.get(
    search_url,
    params={"srsearch": search_term},
    timeout=30,
)
# Surface HTTP errors here instead of failing later while decoding JSON.
search_response.raise_for_status()
data = search_response.json()

# Reset on every run so re-executing the cell does not duplicate documents.
docs = []

def page_summary(page_ids: list[int]):
    """Fetch intro extracts for the given Wikipedia page ids.

    Sends one ``action=query&prop=extracts`` request (plain-text, intro only,
    following redirects) to the English Wikipedia API through the
    notebook-level ``session``.

    Args:
        page_ids: Page ids to look up; they are joined with ``|`` into a
            single ``pageids`` parameter.

    Returns:
        The decoded JSON response, or an empty dict when ``page_ids`` is
        empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Nothing to look up: avoid sending a request with an empty `pageids`.
    if not page_ids:
        return {}

    # NOTE(review): the API limits how many ids / extracts one request may
    # carry — confirm the limits before passing long lists.
    api_url = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&redirects=1&exintro=1&explaintext=1"
    page_response = session.get(
        api_url,
        params={"pageids": "|".join(map(str, page_ids))},
        timeout=30,  # do not let a stalled connection hang the kernel
    )
    # Fail with a clear HTTP error rather than an obscure JSON decode error.
    page_response.raise_for_status()
    return page_response.json()

# Download the rendered HTML of every search hit, convert it to markdown and
# wrap it in a haystack Document appended to `docs`.
for result in data["query"]["search"]:
    # Check the namespace first so skipped pages are not reported as fetched.
    if result["ns"] != 0:
        print(f"[!] Skipping non-article page {result['title']!r}")
        continue
    print(f"[-] Fetching article {result['title']!r}")

    page_response = session.get(
        page_url,
        params={"curid": result["pageid"]},
        timeout=30,  # do not let a stalled connection hang the kernel
    )
    # Without this an HTTP error page would be indexed as article content.
    page_response.raise_for_status()

    # The search API returns the snippet as HTML; store it as markdown.
    summary = html_to_markdown(result["snippet"])

    # Convert to markdown, with the article title as the top-level heading
    content = f"# {result['title']}\n\n" + html_to_markdown(page_response.text)
    docs.append(
        Document(
            content=content,
            meta={
                "title": result["title"],
                "url": page_response.url,
                "snippet": summary,
                "language": "en",
                "date_added": datetime.now().isoformat(),
            },
        )
    )

print(f"Collected {len(docs)} documents")

[-] Fetching article 'Climate change'
[-] Fetching article 'Climate change denial'
[-] Fetching article 'Climate change mitigation'
[-] Fetching article 'United Nations Framework Convention on Climate Change'
[-] Fetching article 'Effects of climate change'
[-] Fetching article 'Climate change adaptation'
[-] Fetching article 'Intergovernmental Panel on Climate Change'
[-] Fetching article 'United Nations Climate Change Conference'
[-] Fetching article 'Paris Agreement'
[-] Fetching article 'Climate change feedbacks'
Collected 10 documents


In [18]:
from meri.wp import split_markdown_documents

# Run the project's markdown splitter over the first collected article.
foo = split_markdown_documents(docs[0].content)

# Render only the first element of the result as markdown for inspection.
display_markdown(foo[0], raw=True)

# Climate change

Human-caused changes to climate on Earth
This article is about the present-day human-induced rise in global temperatures. For natural historical climate trends, see [Climate variability and change](//en.wikipedia.org/wiki/Climate_variability_and_change "Climate variability and change").
"Global warming" redirects here. For other uses, see [Climate change (disambiguation)](//en.wikipedia.org/wiki/Climate_change_(disambiguation) "Climate change (disambiguation)") and [Global warming (disambiguation)](//en.wikipedia.org/wiki/Global_warming_(disambiguation) "Global warming (disambiguation)").







[![The global map shows sea temperature rises of 0.5 to 1 degree Celsius; land temperature rises of 1 to 2 degrees Celsius; and Arctic temperature rises of up to 4 degrees Celsius.](//upload.wikimedia.org/wikipedia/commons/thumb/e/e0/Change_in_Average_Temperature_With_Fahrenheit.svg/300px-Change_in_Average_Temperature_With_Fahrenheit.svg.png)](//en.wikipedia.org/wiki/File:Change_in_Average_Temperature_With_Fahrenheit.svg)

Changes in [surface air temperature](//en.wikipedia.org/wiki/Surface_air_temperature "Surface air temperature") over the past 50 years.[[1]](#cite_note-1) The [Arctic](//en.wikipedia.org/wiki/Arctic "Arctic") has warmed the most, and temperatures on land have generally increased more than [sea surface temperatures](//en.wikipedia.org/wiki/Sea_surface_temperature "Sea surface temperature").


[![](//upload.wikimedia.org/wikipedia/commons/thumb/0/08/Global_Temperature_And_Forces_With_Fahrenheit.svg/300px-Global_Temperature_And_Forces_With_Fahrenheit.svg.png)](//en.wikipedia.org/wiki/File:Global_Temperature_And_Forces_With_Fahrenheit.svg)

Earth's average surface air temperature has increased almost 1.5 °C (about 2.5 °F) since the [Industrial Revolution](//en.wikipedia.org/wiki/Industrial_Revolution "Industrial Revolution"). Natural forces cause some variability, but the 20-year average shows the progressive influence of human activity.[[2]](#cite_note-2)


Present-day **climate change** includes both **global warming**—the ongoing increase in [global average temperature](//en.wikipedia.org/wiki/Global_surface_temperature "Global surface temperature")—and its wider effects on [Earth's climate](//en.wikipedia.org/wiki/Climate_system "Climate system"). [Climate change in a broader sense](//en.wikipedia.org/wiki/Climate_variability_and_change "Climate variability and change") also includes previous long-term changes to Earth's climate. The current rise in global temperatures is [driven by human activities](//en.wikipedia.org/wiki/Scientific_consensus_on_climate_change "Scientific consensus on climate change"), especially [fossil fuel](//en.wikipedia.org/wiki/Fossil_fuel "Fossil fuel") burning since the [Industrial Revolution](//en.wikipedia.org/wiki/Industrial_Revolution "Industrial Revolution").[[3]](#cite_note-3)[[4]](#cite_note-Lynas_2021-4) Fossil fuel use, [deforestation](//en.wikipedia.org/wiki/Deforestation_and_climate_change "Deforestation and climate change"), and some [agricultural](//en.wikipedia.org/wiki/Greenhouse_gas_emissions_from_agriculture "Greenhouse gas emissions from agriculture") and [industrial](//en.wikipedia.org/wiki/Environmental_impact_of_concrete "Environmental impact of concrete") practices release [greenhouse gases](//en.wikipedia.org/wiki/Greenhouse_gas "Greenhouse gas").[[5]](#cite_note-Our_World_in_Data-2020-5) These gases [absorb some of the heat](//en.wikipedia.org/wiki/Greenhouse_effect "Greenhouse effect") that the Earth [radiates](//en.wikipedia.org/wiki/Thermal_radiation "Thermal radiation") after it warms from [sunlight](//en.wikipedia.org/wiki/Sunlight "Sunlight"), warming the lower atmosphere. 
[Carbon dioxide](//en.wikipedia.org/wiki/Carbon_dioxide "Carbon dioxide"), the primary greenhouse gas driving global warming, [has grown by about 50%](//en.wikipedia.org/wiki/Carbon_dioxide_in_Earth%27s_atmosphere "Carbon dioxide in Earth's atmosphere") and is at levels not seen for millions of years.[[6]](#cite_note-6)


Climate change has an increasingly large [impact on the environment](//en.wikipedia.org/wiki/Effects_of_climate_change "Effects of climate change"). [Deserts are expanding](//en.wikipedia.org/wiki/Desertification "Desertification"), while [heat waves](//en.wikipedia.org/wiki/Heat_wave "Heat wave") and [wildfires](//en.wikipedia.org/wiki/Wildfire#Climate_change_effects "Wildfire") are becoming more common.[[7]](#cite_note-7) [Amplified warming in the Arctic](//en.wikipedia.org/wiki/Polar_amplification "Polar amplification") has contributed to thawing [permafrost](//en.wikipedia.org/wiki/Permafrost "Permafrost"), [retreat of glaciers](//en.wikipedia.org/wiki/Retreat_of_glaciers_since_1850 "Retreat of glaciers since 1850") and [sea ice decline](//en.wikipedia.org/wiki/Arctic_sea_ice_decline "Arctic sea ice decline").[[8]](#cite_note-8) Higher temperatures are also causing [more intense storms](//en.wikipedia.org/wiki/Tropical_cyclones_and_climate_change "Tropical cyclones and climate change"), droughts, and other [weather extremes](//en.wikipedia.org/wiki/Extreme_weather "Extreme weather").[[9]](#cite_note-9) Rapid environmental change in [mountains](//en.wikipedia.org/wiki/Montane_ecosystems "Montane ecosystems"), [coral reefs](//en.wikipedia.org/wiki/Coral_reef "Coral reef"), and [the Arctic](//en.wikipedia.org/wiki/Climate_change_in_the_Arctic "Climate change in the Arctic") is forcing many species to relocate or [become extinct](//en.wikipedia.org/wiki/Extinction_risk_from_climate_change "Extinction risk from climate change").[[10]](#cite_note-10) Even if efforts to minimize future warming are successful, some effects will continue for centuries. 
These include [ocean heating](//en.wikipedia.org/wiki/Ocean_temperature#Increasing_temperature_due_to_climate_change "Ocean temperature"), [ocean acidification](//en.wikipedia.org/wiki/Ocean_acidification "Ocean acidification") and [sea level rise](//en.wikipedia.org/wiki/Sea_level_rise "Sea level rise").[[11]](#cite_note-11)


Climate change [threatens people](//en.wikipedia.org/wiki/Effects_of_climate_change_on_human_health "Effects of climate change on human health") with increased [flooding](//en.wikipedia.org/wiki/Flooding "Flooding"), extreme heat, increased [food](//en.wikipedia.org/wiki/Effects_of_climate_change_on_agriculture "Effects of climate change on agriculture") and [water](//en.wikipedia.org/wiki/Water_scarcity#Climate_change "Water scarcity") scarcity, more disease, and [economic loss](//en.wikipedia.org/wiki/Economic_impacts_of_climate_change "Economic impacts of climate change"). [Human migration](//en.wikipedia.org/wiki/Environmental_migrant "Environmental migrant") and conflict can also be a result.[[12]](#cite_note-12) The [World Health Organization](//en.wikipedia.org/wiki/World_Health_Organization "World Health Organization") calls climate change one of the biggest threats to [global health](//en.wikipedia.org/wiki/Global_health "Global health") in the 21st century.[[13]](#cite_note-WHO_Nov_2023-13) Societies and ecosystems will experience more severe risks without [action to limit warming](//en.wikipedia.org/wiki/Climate_change_mitigation "Climate change mitigation").[[14]](#cite_note-14) [Adapting to climate change](//en.wikipedia.org/wiki/Climate_change_adaptation "Climate change adaptation") through efforts like [flood control](//en.wikipedia.org/wiki/Flood_control "Flood control") measures or [drought-resistant crops](//en.wikipedia.org/wiki/Xerophyte "Xerophyte") partially reduces climate change risks, although some limits to [adaptation](//en.wikipedia.org/wiki/Climate_change_adaptation "Climate change adaptation") have already been reached.[[15]](#cite_note-15) Poorer communities are responsible for [a small share of global emissions](//en.wikipedia.org/wiki/Climate_justice "Climate justice"), yet have the least ability to adapt and are most [vulnerable to climate change](//en.wikipedia.org/wiki/Climate_change_vulnerability "Climate change 
vulnerability").[[16]](#cite_note-16)[[17]](#cite_note-17)



[![Bobcat Fire in Monrovia, CA, September 10, 2020](//upload.wikimedia.org/wikipedia/commons/thumb/c/c1/Bobcat_Fire%2C_Los_Angeles%2C_San_Gabriel_Mountains.jpg/298px-Bobcat_Fire%2C_Los_Angeles%2C_San_Gabriel_Mountains.jpg)](//en.wikipedia.org/wiki/File:Bobcat_Fire,_Los_Angeles,_San_Gabriel_Mountains.jpg)[![Bleached colony of Acropora coral](//upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Bleached_colony_of_Acropora_coral.jpg/165px-Bleached_colony_of_Acropora_coral.jpg)](//en.wikipedia.org/wiki/File:Bleached_colony_of_Acropora_coral.jpg)[![A dry lakebed in California, which is experiencing its worst megadrought in 1,200 years.[18]](//upload.wikimedia.org/wikipedia/commons/thumb/d/d3/California_Drought_Dry_Lakebed_2009.jpg/129px-California_Drought_Dry_Lakebed_2009.jpg)](//en.wikipedia.org/wiki/File:California_Drought_Dry_Lakebed_2009.jpg)Examples of some [effects of climate change](//en.wikipedia.org/wiki/Effects_of_climate_change "Effects of climate change"): [Wildfire](//en.wikipedia.org/wiki/Wildfire "Wildfire") intensified by heat and drought, [bleaching of corals](//en.wikipedia.org/wiki/Coral_bleaching "Coral bleaching") occurring more often due to [marine heatwaves](//en.wikipedia.org/wiki/Marine_heatwave "Marine heatwave"), and worsening [droughts](//en.wikipedia.org/wiki/Drought "Drought") compromising water supplies.
Many climate change impacts have been observed in the first decades of the 21st century, with 2023 the warmest on record at +1.48 °C (2.66 °F) since regular tracking began in 1850.[[19]](#cite_note-19)[[20]](#cite_note-20) Additional warming will increase these impacts and can trigger [tipping points](//en.wikipedia.org/wiki/Tipping_points_in_the_climate_system "Tipping points in the climate system"), such as melting all of the [Greenland ice sheet](//en.wikipedia.org/wiki/Greenland_ice_sheet "Greenland ice sheet").[[21]](#cite_note-21) Under the 2015 [Paris Agreement](//en.wikipedia.org/wiki/Paris_Agreement "Paris Agreement"), nations collectively agreed to keep warming "well under 2 °C". However, with pledges made under the Agreement, global warming would still reach about 2.8 °C (5.0 °F) by the end of the century.[[22]](#cite_note-UNEP2024-22) Limiting warming to 1.5 °C would require halving emissions by 2030 and achieving [net-zero](//en.wikipedia.org/wiki/Carbon_neutrality "Carbon neutrality") emissions by 2050.[[23]](#cite_note-23)[[24]](#cite_note-24)


[Fossil fuel use can be phased out](//en.wikipedia.org/wiki/Fossil_fuel_phase-out "Fossil fuel phase-out") by [conserving energy](//en.wikipedia.org/wiki/Conserving_energy "Conserving energy") and switching to energy sources that do not produce significant carbon pollution. These energy sources include [wind](//en.wikipedia.org/wiki/Wind_power "Wind power"), [solar](//en.wikipedia.org/wiki/Solar_power "Solar power"), [hydro](//en.wikipedia.org/wiki/Hydropower "Hydropower"), and [nuclear power](//en.wikipedia.org/wiki/Nuclear_power "Nuclear power").[[25]](#cite_note-25) Cleanly generated electricity can replace fossil fuels for [powering transportation](//en.wikipedia.org/wiki/Electric_vehicles "Electric vehicles"), [heating buildings](//en.wikipedia.org/wiki/Electric_heating "Electric heating"), and running industrial processes.[[26]](#cite_note-26) Carbon can also be [removed from the atmosphere](//en.wikipedia.org/wiki/Carbon_dioxide_removal "Carbon dioxide removal"), for instance by [increasing forest cover](//en.wikipedia.org/wiki/Forest_protection "Forest protection") and farming with methods that [capture carbon in soil](//en.wikipedia.org/wiki/Carbon_farming "Carbon farming").[[27]](#cite_note-27)

In [None]:
# Build pipeline for splitting and indexing
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack import Pipeline

# Cleaner created with its default settings.
document_cleaner = DocumentCleaner()

# Splitter that delegates the actual splitting to the project's markdown
# splitting function.
document_splitter = DocumentSplitter(
    splitting_function=split_markdown_documents,
    split_by="function",
)

# In-memory document store configured for cosine embedding similarity.
db = InMemoryDocumentStore(
    embedding_similarity_function="cosine",
)

# NOTE(review): the pipeline is created empty — none of the components above
# are attached to it in this cell. The recorded AttributeError shows that
# `add_node` is not available on this Pipeline class.
pipeline = Pipeline()


AttributeError: 'Pipeline' object has no attribute 'add_node'

In [23]:
# Print the first document's markdown unrendered, so escape sequences
# (e.g. "\-") produced by the HTML-to-markdown conversion stay visible.
print(docs[0].content)

# Climate change

Human\-caused changes to climate on Earth
This article is about the present\-day human\-induced rise in global temperatures. For natural historical climate trends, see [Climate variability and change](//en.wikipedia.org/wiki/Climate_variability_and_change "Climate variability and change").
"Global warming" redirects here. For other uses, see [Climate change (disambiguation)](//en.wikipedia.org/wiki/Climate_change_(disambiguation) "Climate change (disambiguation)") and [Global warming (disambiguation)](//en.wikipedia.org/wiki/Global_warming_(disambiguation) "Global warming (disambiguation)").







[![The global map shows sea temperature rises of 0.5 to 1 degree Celsius; land temperature rises of 1 to 2 degrees Celsius; and Arctic temperature rises of up to 4 degrees Celsius.](//upload.wikimedia.org/wikipedia/commons/thumb/e/e0/Change_in_Average_Temperature_With_Fahrenheit.svg/300px-Change_in_Average_Temperature_With_Fahrenheit.svg.png)](//en.wikipedia.org/wiki/File:C

In [None]:
from IPython.display import display_markdown


# `text` on a wikipediaapi page is a property that returns a string, so it
# must be read, not called.
# NOTE(review): `page` was created with the HTML extract format, so this is
# HTML rather than wikitext — confirm that is what mwparserfromhell should
# parse. The IndexError recorded below predates this change and may have a
# different cause; re-run to confirm.
wikicode = mwparserfromhell.parse(page.text)


IndexError: list index out of range