## Exracting text from web

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_article_content(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        # Initialize a list to hold the scraped content in order
        article_content_ordered = []
        
        # Extract all relevant tags (headings and paragraphs)
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
            article_content_ordered.append((tag.name, tag.get_text(strip=True)))
        
        return article_content_ordered
    else:
        return "Failed to retrieve the article."

In [6]:
url = "https://cloud.google.com/blog/products/application-development/rest-vs-rpc-what-problems-are-you-trying-to-solve-with-your-apis" # Replace this with the URL of the article you want to scrape
article_content = scrape_article_content(url)
for tag, text in article_content:
    print(f"{tag}: {text}")

h1: REST vs RPC: What problems are you trying to solve with your APIs?
h5: Martin Nally
p: Software Developer and API designer, Apigee
p: A fairy ring is anaturally occurring circle of mushroomsthat grows in forested areas or grassland. In folklore, fairy rings have magical properties and superstitious people carefully avoid disturbing them. There’s an old joke about the farmer who was asked why he went to such lengths to avoid ploughing up fairy rings. He replied, "because I'd be a fool if I didn't."
p: Many people would say the same thing about why they build APIs. In fact, it is important to think about the fundamental problem you are trying to solve with your API because the style of API you create and the technologies you choose should depend on your answer.
h2: Following procedure
p: Procedures, also called functions, have been the dominant construct for organizing computer code ever since FORTRAN II introduced the concept in 1958. All mainstream modern programming languages that

In [17]:
def filter_content(content, min_paragraph_length=80):
    # Step 1: Filter out short paragraphs
    intermediate_content = [(tag, text) for tag, text in content if not (tag == 'p' and len(text) < min_paragraph_length)]
    
    # Step 2: Remove trailing headings
    # Find the last paragraph to determine where to stop removing headings
    last_paragraph_index = None
    for i in range(len(intermediate_content) - 1, -1, -1):
        if intermediate_content[i][0] == 'p':
            last_paragraph_index = i
            break
    
    # If there's no paragraph at all, return an empty list to indicate no valid content
    if last_paragraph_index is None:
        return []
    
    # Remove trailing headings by slicing the list up to and including the last paragraph
    filtered_content = intermediate_content[:last_paragraph_index + 1]
    
    return filtered_content

In [18]:
filtered_content = filter_content(article_content)

for tag, text in filtered_content:
    print(f"{tag}: {text}")

h1: REST vs RPC: What problems are you trying to solve with your APIs?
h5: Martin Nally
p: A fairy ring is anaturally occurring circle of mushroomsthat grows in forested areas or grassland. In folklore, fairy rings have magical properties and superstitious people carefully avoid disturbing them. There’s an old joke about the farmer who was asked why he went to such lengths to avoid ploughing up fairy rings. He replied, "because I'd be a fool if I didn't."
p: Many people would say the same thing about why they build APIs. In fact, it is important to think about the fundamental problem you are trying to solve with your API because the style of API you create and the technologies you choose should depend on your answer.
h2: Following procedure
p: Procedures, also called functions, have been the dominant construct for organizing computer code ever since FORTRAN II introduced the concept in 1958. All mainstream modern programming languages that are used to produce and consume APIs—for examp

In [27]:
import json

def generate_json_from_content(content):
    root = []  # Root list to hold the entire document structure
    stack = [root]  # Stack to keep track of current nesting level

    for tag, text in content:
        item = {"tag": tag, "text": text, "content": []}

        # Determine the current level based on tag
        if tag.startswith('h'):
            level = int(tag[1])
        else:
            level = float('inf')  # Treat non-headings as the deepest level

        # Pop from stack until we find the parent level
        while len(stack) > level:
            stack.pop()

        # Append the current item to the appropriate level
        if tag.startswith('h'):
            stack[-1].append(item)  # Append to the current level
            stack.append(item["content"])  # Push the new level onto the stack for headings
        else:
            stack[-1].append(item)  # Append paragraphs directly to the current heading content

    return root


In [28]:
document_json = generate_json_from_content(filtered_content)
print(json.dumps(document_json, indent=2))

[
  {
    "tag": "h1",
    "text": "REST vs RPC: What problems are you trying to solve with your APIs?",
    "content": [
      {
        "tag": "h5",
        "text": "Martin Nally",
        "content": [
          {
            "tag": "p",
            "text": "A fairy ring is anaturally occurring circle of mushroomsthat grows in forested areas or grassland. In folklore, fairy rings have magical properties and superstitious people carefully avoid disturbing them. There\u2019s an old joke about the farmer who was asked why he went to such lengths to avoid ploughing up fairy rings. He replied, \"because I'd be a fool if I didn't.\"",
            "content": []
          },
          {
            "tag": "p",
            "text": "Many people would say the same thing about why they build APIs. In fact, it is important to think about the fundamental problem you are trying to solve with your API because the style of API you create and the technologies you choose should depend on your answer.",