# In [4]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# # Function to fetch HTML from a URL
# def fetch_html_from_url(url):
#     """
#     Fetch the HTML content of a webpage from a given URL.
    
#     :param url: The URL of the webpage to fetch.
#     :return: The raw HTML content as a string.
#     """
#     response = requests.get(url)
#     response.raise_for_status()  # Raise an error for HTTP issues
#     return response.text

def fetch_html_from_url(url, timeout=10):
    """
    Fetch the HTML content of a webpage from a given URL.

    The request is sent with a browser-like User-Agent header. Failures are
    reported with ``print`` instead of being raised, so callers must check
    for a ``None`` result.

    :param url: The URL of the webpage to fetch.
    :param timeout: Seconds to wait for the server before giving up.
    :return: The response body as a string, or None if the request failed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # Without an explicit timeout a stalled server would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        # Connection problems, timeouts, malformed URLs, etc.
        print(f"Other error occurred: {err}")
    else:
        return response.text
    # Make the failure result explicit rather than falling off the end.
    return None



# Function to extract specific HTML tags and clean up
def extract_clean_html(html_content, tags_to_extract=None, minimal_css=True):
    """
    Extract selected elements from an HTML document and return them as HTML.

    Each piece of content is emitted once, in document order: an element
    that sits inside another extracted element is not repeated on its own,
    because it is already part of its ancestor's markup.

    :param html_content: Raw HTML markup to parse.
    :param tags_to_extract: Tag names to keep; None selects every tag.
    :param minimal_css: If True, remove the ``style``, ``class`` and ``id``
        attributes from every matched element.
    :return: The extracted elements serialized to HTML, joined by newlines.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    if tags_to_extract is None:
        matches = soup.find_all(True)
    else:
        # Drop repeated names so no tag is looked up more than once.
        wanted = list(dict.fromkeys(tags_to_extract))
        if not wanted:
            return ""
        matches = soup.find_all(wanted)

    extracted_content = []
    extracted_ids = set()
    for element in matches:
        if minimal_css:
            # Strip nested matches too: they are serialized with their ancestor.
            for attr in ('style', 'class', 'id'):
                element.attrs.pop(attr, None)
        # find_all yields ancestors before descendants, so an ancestor that
        # was extracted is already recorded when its descendants are reached.
        if any(id(parent) in extracted_ids for parent in element.parents):
            continue
        extracted_ids.add(id(element))
        extracted_content.append(element)

    return "\n".join(str(element) for element in extracted_content)

# Function to convert HTML to Markdown and save to a file
def convert_html_to_markdown(html_content, output_file="output.md"):
    """
    Convert HTML to Markdown, write it to a file and return it.

    The ``a`` tag is passed to the converter's ``strip`` option.

    :param html_content: HTML markup to convert.
    :param output_file: Path of the Markdown file to write (UTF-8).
    :return: The generated Markdown text.
    """
    rendered = md(html_content, strip=['a'])
    with open(output_file, mode="w", encoding="utf-8") as sink:
        sink.write(rendered)
    return rendered

# Complete workflow for both file and URL
def process_html(input_source, is_url=False, output_file="output.md", tags_to_extract=None):
    """
    Process HTML content from a file or URL, extract relevant tags,
    convert to Markdown, and save it.

    :param input_source: File path or URL of the HTML content.
    :param is_url: Set to True if input_source is a URL.
    :param output_file: Name of the output Markdown file.
    :param tags_to_extract: List of tags to extract (optional).
    :return: The generated Markdown text.
    :raises RuntimeError: If input_source is a URL and it could not be fetched.
    """
    # Step 1: Fetch HTML content
    if is_url:
        raw_html = fetch_html_from_url(input_source)
        if raw_html is None:
            # The fetch helper reports failures by returning None; stop here
            # instead of passing None on to the HTML parser.
            raise RuntimeError(f"Could not fetch HTML from {input_source!r}")
    else:
        with open(input_source, "r", encoding="utf-8") as file:
            raw_html = file.read()

    # Step 2: Extract and clean HTML
    cleaned_html = extract_clean_html(raw_html, tags_to_extract, minimal_css=True)

    # Step 3: Convert to Markdown and save
    markdown_output = convert_html_to_markdown(cleaned_html, output_file=output_file)
    print(f"Markdown file generated successfully! Check '{output_file}'")
    return markdown_output


# In [None]:

# Example usage: run the full workflow when executed as a script.
if __name__ == "__main__":
    # Earlier runs, kept for reference (a local file, then several URLs):
    # process_html("mayank.txt", is_url=False, output_file="mayank_output.md", tags_to_extract=["article", "p", "h1"])
    # process_html("https://nextjs.org/docs", is_url=True, output_file="nextjs_docs.md", tags_to_extract=["article", "p", "h1"])
    # process_html("https://nextjs.org/docs/app/building-your-application/configuring/environment-variables", is_url=True, output_file="nextjs_docs1.md", tags_to_extract=["article", "p", "h1"])
    # process_html("https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55", is_url=True, output_file="medium.md", tags_to_extract=["article", "p", "h1"])
    # process_html("https://python.langchain.com/docs/concepts/chat_models/", is_url=True, output_file="langchain.md", tags_to_extract=["article", "p", "h1"])

    # Current run: fetch a documentation page and save it as Markdown.
    process_html(
        "https://nextjs.org/docs/app/building-your-application/authentication",
        is_url=True,
        output_file="authenti.md",
        tags_to_extract=["article", "p", "h1"],
    )


# Output:
# Markdown file generated successfully! Check 'authenti.md'