 - https://unstructured.io/?ref=blog.langchain.dev

 - https://blog.langchain.dev/semi-structured-multi-modal-rag/

 - https://arxiv.org/pdf/2411.13773v1

In [None]:
"""
A Presidential election was held



== Nominees ==


=== Republican Party ===

The fight for the Republican nomination...



==== Republican National Convention ====

When the 1952 Republican National Convention opened in Chicago...



=== Democratic Party ===


The expected candidate for the Democratic nomination was the incumbent President Harry S. Truman...



==== Democratic National Convention ====
The 1952 Democratic National Convention was held in Chicago...


== General election ==


=== Campaign issues ===
The Eisenhower campaign was one of the first presidential campaigns

"""

In [None]:
{
    "introduction": "A Presidential election was held",
    "Nominees": {
        "section_text": "",
        "subsections": [
            {
                "Republican Party": {
                    "section_text": "The fight for the Republican nomination..."  ,
                    "subsections": [
                        {
                            "Republican National Convention": {
                                "section_text": "When the 1952 Republican National Convention opened in Chicago...",
                                "subsections": []
                            },
                            
                        }
                    ]
                }   
            },
            {
                "Democratic Party": {
                    "section_text": "The expected candidate for the Democratic nomination was the incumbent President Harry S. Truman...",
                    "subsections": [
                        {
                            "Democratic National Convention": {
                                "section_text": "The 1952 Democratic National Convention was held in Chicago...",
                                "subsections": []
                            }
                        }
                    ]
                },
            }
        ]
    },
    "General election": {
        "section_text": "",
        "subsections": [
            {
                "Campaign issues": {
                    "section_text": "The Eisenhower campaign was one of the first presidential campaigns"  ,
                    "subsections": []
                }
            },
        ]
    }
}

In [1]:
import getpass
import os
import re

In [2]:
from langchain_groq import ChatGroq

In [3]:
# Ask for the Groq API key only when the environment does not already provide
# one, so re-running this cell (or a Run All with the variable exported) neither
# blocks on input nor overwrites a valid key. getpass keeps the key out of the
# notebook source and of the rendered output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

 ········


In [4]:
# Model identifiers that can be passed to ChatGroq. Only LLAMA_8B is used by
# the cells visible in this notebook; the others are kept for quick swapping.
LLAMA_8B = "llama3-8b-8192"
LLAMA_70B = "llama3-70b-8192"
GEMMA2_9B = "gemma2-9b-it"

# NOTE(review): the recorded 413 error further down shows this model rejecting
# a 68259-token request against a 30000 tokens-per-minute limit, so whatever is
# placed in the prompt context has to stay well below that.
model = ChatGroq(model=LLAMA_8B)

In [3]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [4]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.retrievers import WikipediaRetriever

In [5]:
# Wikipedia retriever used both for fetching the article that is parsed below
# and as the context source of the question-answering chain.
# doc_content_chars_max presumably caps how many characters of each page are
# kept (the article text loaded below is 38149 characters) — TODO confirm
# against the LangChain documentation.
retriever = WikipediaRetriever(doc_content_chars_max=100000)
# NOTE(review): `loader` is not used by any cell visible in this notebook —
# confirm whether it is still needed.
loader = WikipediaLoader(query="1952 United States presidential election", load_max_docs=1, doc_content_chars_max=100000)

In [6]:
docs = retriever.invoke("1952 United States presidential election")

In [19]:
raw_text = docs[0].page_content

In [21]:
len(raw_text)

38149

In [13]:
def split_text_by_section_markers(text):
    """
    Split Wikipedia-style text into the chunks of body text between headings.

    A heading is a whole line of the form ``== Title ==`` with two or more
    ``=`` on each side. Heading lines themselves are discarded.

    Args:
        text (str): The Wikipedia-like content.
    Returns:
        list[str]: Stripped, non-empty text chunks in document order. The
        first chunk is the text before the first heading, if there is any.
    """
    # Anchor on whole lines and accept any run of two or more '=' so that
    # deeper headings ("=== Sub ===") are consumed entirely instead of leaving
    # stray '=' characters in the neighbouring chunks. The title may contain
    # punctuation, which a [\w\s]+ character class would reject.
    # No capturing groups: re.split would otherwise return them as chunks.
    heading_pattern = r"(?m)^={2,}[ \t]*[^=\s][^\n]*?={2,}[ \t]*$"
    chunks = (chunk.strip() for chunk in re.split(heading_pattern, text))
    return [chunk for chunk in chunks if chunk]

In [22]:
def load_and_split_wikipedia_sections(raw_text):
    """
    Flatten Wikipedia-style text into a mapping of heading title to body text.

    Headings of every level are treated alike (the nesting is not kept). Text
    before the first heading is stored under "Introduction". Within a section,
    the non-empty lines are stripped and joined with single spaces.

    Args:
        raw_text (str): The Wikipedia-like content.
    Returns:
        dict[str, str]: Section title -> section text, in order of first
        appearance. When the same title occurs more than once, the text of
        all occurrences is concatenated in document order.
    """
    section_pattern = r"(?m)^(==+)\s*(.*?)\s*\1$"

    current_section = "Introduction"
    sections = {current_section: []}

    for line in raw_text.split("\n"):
        match = re.match(section_pattern, line)
        if match:
            current_section = match.group(2).strip()
            # setdefault (rather than assigning a new list) keeps the text
            # already collected for an earlier section with the same title.
            sections.setdefault(current_section, [])
            continue

        stripped = line.strip()
        if stripped:
            sections[current_section].append(stripped)

    return {title: " ".join(lines) for title, lines in sections.items()}


In [29]:
def load_and_convert_to_json(raw_text):
    """
    Convert Wikipedia-style text into a nested dictionary of sections.

    Each heading line (``== Title ==``, ``=== Title ===``, ...) opens a section
    whose nesting depth is given by its number of ``=``. Every section is a
    dict with two keys: "section_text" (its own lines, stripped, each followed
    by a single space) and "subsections" (a dict of title -> section for the
    headings nested directly beneath it).

    Text that appears before the first heading is not included in the result.

    Args:
        raw_text (str): The Wikipedia-like content.
    Returns:
        dict: Title -> section for the outermost headings.
    """
    # Regular expression to identify section and subsection headers
    section_pattern = r"(?m)^(==+)\s*(.*?)\s*\1$"

    section_map = {}    # Outermost sections, keyed by title
    section_stack = []  # (header_level, section_entry) for the open ancestors

    for line in raw_text.split("\n"):
        match = re.match(section_pattern, line)
        if match:
            header_level = len(match.group(1))  # Number of `=` defines level
            section_title = match.group(2).strip()
            section_entry = {"section_text": "", "subsections": {}}

            # Close every open section at the same or a deeper level. The
            # comparison uses the stored header levels, not the stack length:
            # levels start at 2 ("=="), so comparing against the length would
            # never pop siblings and would nest them inside one another.
            while section_stack and section_stack[-1][0] >= header_level:
                section_stack.pop()

            # Attach to the nearest open ancestor, or to the top level
            if section_stack:
                siblings = section_stack[-1][1]["subsections"]
            else:
                siblings = section_map
            siblings[section_title] = section_entry

            section_stack.append((header_level, section_entry))
        elif section_stack:
            # Body line: belongs to the innermost open section
            section_stack[-1][1]["section_text"] += line.strip() + " "

    return section_map


In [36]:
def _section_header_pattern(level):
    """Return a regex for heading lines with exactly `level` '=' on each side."""
    marker = "=" * level
    # The lookahead/lookbehind reject longer '=' runs, so a level-2 pattern
    # does not match "=== Title ===" with the extra '=' leaking into the title.
    return rf"(?m)^{marker}(?!=)[ \t]*(.*?)[ \t]*(?<!=){marker}[ \t]*$"


def parse_sections_recursive(text, current_level=2):
    """
    Recursively parse text into sections and subsections.

    Args:
        text (str): The Wikipedia-like content.
        current_level (int): The current header level to process, i.e. the
            number of `=` on each side of the heading (default is 2 for `==`).
    Returns:
        list: One single-key dict per heading at `current_level`, in document
        order, of the form
        ``{title: {"section_text": str, "subsections": list}}``.
        "section_text" is the section's own text up to its first subsection
        heading; "subsections" is the same structure for the next level.
        An empty list is returned when no heading of this level is found.
    """
    matches = list(re.finditer(_section_header_pattern(current_level), text))

    if not matches:
        return []

    child_pattern = _section_header_pattern(current_level + 1)
    sections = []
    for i, match in enumerate(matches):
        section_title = match.group(1).strip()
        start_pos = match.end()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        # Everything up to the next heading of this level belongs to the section
        section_content = text[start_pos:end_pos].strip()
        subsections = parse_sections_recursive(section_content, current_level + 1)

        # Keep the lead text that precedes the first subsection instead of
        # discarding it when subsections exist.
        first_child = re.search(child_pattern, section_content)
        if first_child:
            section_text = section_content[:first_child.start()].strip()
        else:
            section_text = section_content

        sections.append({
            section_title: {
                "section_text": section_text,
                "subsections": subsections
            }
        })

    return sections


def wikipedia_to_json(text):
    """
    Build a nested dictionary from Wikipedia-like text.

    Args:
        text (str): The Wikipedia-like content.
    Returns:
        dict: An "introduction" entry holding the stripped text that precedes
        the first header line (or the whole stripped text when there is no
        header), followed by one entry per section returned by
        `parse_sections_recursive`, keyed by section title.
    """
    # Everything ahead of the first header line is the introduction.
    first_header = re.search(r"(?m)^==+\s*.*?\s*==+$", text)
    if first_header:
        introduction = text[:first_header.start()].strip()
    else:
        introduction = text.strip()

    document = {"introduction": introduction}
    for section in parse_sections_recursive(text):
        # Each parsed section is a one-entry {title: body} mapping.
        title, body = list(section.items())[0]
        document[title] = body

    return document

In [50]:
import re
import json

def parse_text_to_json(text):
    """
    Parse the given semi-structured Wikipedia text into a nested JSON structure.

    Args:
        text (str): Wikipedia-like content whose headings are lines of the
            form `== Title ==`, `=== Title ===`, ...
    Returns:
        dict: An "introduction" entry (stripped text before the first
        heading) plus one entry per `==` heading, keyed by title. Each entry
        is ``{"section_text": str, "subsections": list}``, where
        "section_text" is the text between the heading and the next heading
        of any level, and "subsections" is a list of single-key dicts of the
        same shape for the headings nested directly beneath it.
    """
    section_pattern = r"(?m)^(==+)\s*(.*?)\s*\1$"
    matches = list(re.finditer(section_pattern, text))

    def create_section_hierarchy(headers, region_end, level=2):
        """
        Recursively build the sections found at `level` among `headers`.

        `headers` are the heading matches inside one parent section and
        `region_end` is the text offset at which that parent section ends.
        """
        sections = []
        i = 0

        while i < len(headers):
            header = headers[i]

            # Ensure we're processing only the headers at the current level
            if len(header.group(1)) != level:
                i += 1
                continue

            # The section's descendants are the run of deeper headers that
            # follows it; the run stops at the next header of the same or a
            # shallower level.
            next_sibling = i + 1
            while (next_sibling < len(headers)
                   and len(headers[next_sibling].group(1)) > level):
                next_sibling += 1

            # Own text stops at the very next header, whatever its level
            own_end = headers[i + 1].start() if i + 1 < len(headers) else region_end
            children_end = (
                headers[next_sibling].start()
                if next_sibling < len(headers)
                else region_end
            )

            # Recurse only into this section's own descendants, so that
            # subsections of later sections are not attached here as well.
            subsections = create_section_hierarchy(
                headers[i + 1:next_sibling], children_end, level + 1
            )

            sections.append({
                header.group(2).strip(): {
                    "section_text": text[header.end():own_end].strip(),
                    "subsections": subsections
                }
            })

            # Skip processed subsections
            i = next_sibling

        return sections

    # Parse the introduction (content before the first header)
    introduction_end = matches[0].start() if matches else len(text)
    introduction = text[:introduction_end].strip()

    # Build the sections hierarchy
    sections = create_section_hierarchy(matches, len(text))

    # Combine introduction with the main sections
    result = {"introduction": introduction}
    for section in sections:
        result.update(section)

    return result


In [14]:
sections = split_text_by_section_markers(docs[0].page_content[:8000])

In [23]:
sections = load_and_split_wikipedia_sections(raw_text)

In [30]:
sections = load_and_convert_to_json(raw_text)

In [37]:
sections = parse_sections_recursive(raw_text)

In [51]:
sections = parse_text_to_json(raw_text)

In [53]:
print(sections.keys())

dict_keys(['introduction', 'Nominees', 'General election', 'See also', 'Notes', 'References', 'Further reading', 'External links'])


In [56]:
sections["Nominees"]["subsections"][0]

{'Republican Party': {'section_text': "The fight for the Republican nomination was between General Dwight D. Eisenhower, who became the candidate of the party's moderate Eastern Establishment; Senator Robert A. Taft from Ohio, the longtime leader of the party's conservative wing; Governor Earl Warren of California, who appealed to Western delegates and independent voters; and former Governor Harold Stassen of Minnesota, who still had a base of support in the Midwest.\nThe moderate Eastern Republicans were led by New York Governor Thomas E. Dewey, the party's presidential nominee in 1944 and 1948. The moderates tended to be interventionists who felt that the country needed to fight the Cold War overseas and confront the Soviet Union in Eurasia. They were also willing to accept most aspects of the social welfare state created by the New Deal in the 1930s but sought to reform the programs to be more efficient and business-friendly. The moderates were also concerned with ending the Republi

In [32]:
# Prompt template with two placeholders, `context` and `question`; the
# instruction tells the model to answer from the supplied context only.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the question on US elections based only on the context provided.
    Context: {context}
    Question: {question}
    """
)


def format_docs(docs):
    """Join the `page_content` of every document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Pipeline: the input string is sent to the retriever, whose documents are
# joined by format_docs into `context`, and is also passed through unchanged as
# `question`; the filled prompt goes to the model and the reply is parsed into
# a plain string.
# NOTE(review): the retriever is queried with the question itself, so the
# retrieved articles are not pinned to the 1952 election — the recorded answer
# below discusses the 2020 and 2024 elections instead. The recorded 413 error
# also shows the joined context exceeding the model's token limit; the context
# needs to be narrowed (for example to the relevant parsed sections) before it
# is sent.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [33]:
question = "Mention the key election campaign issues and the positions of both the parties"

response = chain.invoke(question)

APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama3-8b-8192` in organization `org_01j2xje983ftybdz1qzan5tk9v` on tokens per minute (TPM): Limit 30000, Requested 68259, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [32]:
print(response)

Based on the provided context, the key election campaign issues and positions of both parties in the 2020 and 2024 US presidential elections are as follows:

**2020 Election:**

* Public health and economic impacts of the ongoing COVID-19 pandemic
* Civil unrest in reaction to the police murder of George Floyd and others
* Supreme Court following the death of Ruth Bader Ginsburg and confirmation of Amy Coney Barrett
* Future of the Affordable Care Act

Democratic Party:

* Advocated for healthcare reform, universal child care, paid sick leave, and supporting unions
* Supported liberal internationalism as well as tough stances against China and Russia
* Favored abortion rights, voting rights, LGBT rights, action on climate change, and the legalization of marijuana

Republican Party:

* Opposed liberal internationalism and advocated for a more isolationist foreign policy
* Favored a more conservative economic agenda, including tax cuts and deregulation
* Opposed abortion rights and advoc

In [None]:
"""
    Can you extract information from the provided context in the given format?

    Republican candidate: 
    Democratic candidate: 

    Key issues:
        issue 1:
            republican position:
            democratic position:
        issue 2:
            
    Republican vote percent: 
    Democratic vote percent:

    Republican electoral votes:
    Democratic electoral votes:

    Republican states carried: [<list of states>]
    Democratic states carried: []
    
    Context: {context}
    Question: {question}
"""