In [2]:
import requests
from bs4 import BeautifulSoup

def extract_article_content(url):
    try:
        # Send an HTTP GET request
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise error for HTTP issues
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all <article> tags and extract their text
        articles = soup.find_all('article')
        article_content = "\n".join([article.get_text(strip=True) for article in articles])
        
        # Find all <p> tags and extract their text
        paragraphs = soup.find_all('p')
        paragraph_content = "\n".join([p.get_text(strip=True) for p in paragraphs])
        
        # Combine and return the content
        return article_content + "\n\n" + paragraph_content

    except Exception as e:
        return f"An error occurred: {e}"




In [4]:
import re

def generate_markdown(raw_text, output_file="output.md"):
    # Step 1: Clean the raw text
    cleaned_text = re.sub(r'\s+', ' ', raw_text)  # Normalize whitespace
    cleaned_text = re.sub(r'(?<!\.)\n', ' ', cleaned_text)  # Replace line breaks with spaces
    
    # Step 2: Extract and format sections
    # A simple heuristic to identify headers and content
    cleaned_text = cleaned_text.replace("Good to know:", "\n### Good to know:\n")
    cleaned_text = re.sub(r'(?<!\w)([A-Z][a-zA-Z0-9 ]+):', r'\n## \1\n', cleaned_text)  # Convert headers
    cleaned_text = re.sub(r'([.?!])\s*([A-Z])', r'\1\n\n\2', cleaned_text)  # Split into sentences
    
    # Step 3: Handle bullets and lists
    cleaned_text = re.sub(r'\b([A-Za-z0-9]+[.])\s', r'- \1 ', cleaned_text)  # Turn into bullet points
    
    # Step 4: Extract links (if any) and add to the end of the file
    links = re.findall(r'https?://\S+', raw_text)
    if links:
        links_section = "\n\n## Links\n" + "\n".join(f"- {link}" for link in links)
    else:
        links_section = ""
    
    # Step 5: Write the Markdown content to a file
    markdown_content = cleaned_text + links_section
    with open(output_file, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    return markdown_content



In [3]:
# Example usage
url = "https://nextjs.org/docs/app/getting-started/project-structure"
content = extract_article_content(url)
with open("content.txt", "w", encoding="utf-8") as file:
    file.write(content)

In [6]:
with open("content.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

# Generate Markdown content
markdown_content = generate_markdown(raw_text, output_file="mayank_output.md")

print("Markdown file generated successfully! Check 'mayank_output.md'")

Markdown file generated successfully! Check 'mayank_output.md'


In [1]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Function to fetch HTML from a URL
def fetch_html_from_url(url):
    """
    Fetch the HTML content of a webpage from a given URL.
    
    :param url: The URL of the webpage to fetch.
    :return: The raw HTML content as a string.
    """
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for HTTP issues
    return response.text

# Function to extract specific HTML tags and clean up
def extract_clean_html(html_content, tags_to_extract=None, minimal_css=True):
    soup = BeautifulSoup(html_content, 'html.parser')
    if tags_to_extract is None:
        tags_to_extract = [tag.name for tag in soup.find_all()]
    extracted_content = []
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            if minimal_css:
                for attr in ['style', 'class', 'id']:
                    if attr in element.attrs:
                        del element.attrs[attr]
            extracted_content.append(element)
    return "\n".join(str(tag) for tag in extracted_content)

# Function to convert HTML to Markdown and save to a file
def convert_html_to_markdown(html_content, output_file="output.md"):
    markdown_content = md(html_content, strip=['a'])
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(markdown_content)
    return markdown_content

# Complete workflow for both file and URL
def process_html(input_source, is_url=False, output_file="output.md", tags_to_extract=None):
    """
    Process HTML content from a file or URL, extract relevant tags, 
    convert to Markdown, and save it.
    
    :param input_source: File path or URL of the HTML content.
    :param is_url: Set to True if input_source is a URL.
    :param output_file: Name of the output Markdown file.
    :param tags_to_extract: List of tags to extract (optional).
    """
    # Step 1: Fetch HTML content
    if is_url:
        raw_html = fetch_html_from_url(input_source)
    else:
        with open(input_source, "r", encoding="utf-8") as file:
            raw_html = file.read()

    # Step 2: Extract and clean HTML
    cleaned_html = extract_clean_html(raw_html, tags_to_extract, minimal_css=True)

    # Step 3: Convert to Markdown and save
    markdown_output = convert_html_to_markdown(cleaned_html, output_file=output_file)
    print(f"Markdown file generated successfully! Check '{output_file}'")
    return markdown_output

# Example usage
if __name__ == "__main__":
    # Use a file
    # process_html("mayank.txt", is_url=False, output_file="mayank_output.md", tags_to_extract=["article", "p", "h1"])

    # Use a URL
    # process_html("https://nextjs.org/docs", is_url=True, output_file="nextjs_docs.md", tags_to_extract=["article", "p", "h1"])
    # process_html("https://nextjs.org/docs/app/building-your-application/configuring/environment-variables", is_url=True, output_file="nextjs_docs1.md", tags_to_extract=["article", "p", "h1"])
    process_html("https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55", is_url=True, output_file="medium.md", tags_to_extract=["article", "p", "h1"])


HTTPError: 403 Client Error: Forbidden for url: https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55

In [5]:
%pip install selenium

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Function to fetch HTML content using Selenium (for dynamic pages like Medium)
def fetch_html_with_selenium(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode (no browser UI)
    chrome_options.add_argument("--disable-gpu")  # Disable GPU for better performance
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    
    # Specify the location of the ChromeDriver
    service = Service(executable_path='/path/to/chromedriver')  # Replace with the path to your ChromeDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Navigate to the URL
    driver.get(url)
    
    # Wait for the page to load completely (you may adjust the sleep time based on the site)
    time.sleep(5)
    
    # Get the page source after JavaScript has been rendered
    html_content = driver.page_source
    
    driver.quit()  # Close the browser
    
    return html_content

# Function to extract specific HTML tags and clean up
def extract_clean_html(html_content, tags_to_extract=None, minimal_css=True):
    soup = BeautifulSoup(html_content, 'html.parser')
    if tags_to_extract is None:
        tags_to_extract = [tag.name for tag in soup.find_all()]
    extracted_content = []
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            if minimal_css:
                for attr in ['style', 'class', 'id']:
                    if attr in element.attrs:
                        del element.attrs[attr]
            extracted_content.append(element)
    return "\n".join(str(tag) for tag in extracted_content)

# Function to convert HTML to Markdown and save to a file
def convert_html_to_markdown(html_content, output_file="output.md"):
    markdown_content = md(html_content, strip=['a'])
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(markdown_content)
    return markdown_content

# Complete workflow for processing HTML from URL (using Selenium)
def process_html_from_url(url, output_file="output.md", tags_to_extract=None):
    # Step 1: Fetch HTML content using Selenium
    raw_html = fetch_html_with_selenium(url)
    
    # Step 2: Extract and clean HTML content
    cleaned_html = extract_clean_html(raw_html, tags_to_extract, minimal_css=True)
    
    # Step 3: Convert to Markdown and save
    markdown_output = convert_html_to_markdown(cleaned_html, output_file=output_file)
    
    print(f"Markdown file generated successfully! Check '{output_file}'")
    return markdown_output

# Example usage
if __name__ == "__main__":
    url = "https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55"
    process_html_from_url(url, output_file="medium_article.md", tags_to_extract=["article", "p", "h1"])


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [2]:
import requests
from bs4 import BeautifulSoup

# Function to fetch HTML content with custom headers to bypass 403 errors
def fetch_html_from_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.text
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except Exception as err:
        print(f"Other error occurred: {err}")

# Now you can use this function to fetch the HTML content of Medium or any other website.
url = "https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55"
html_content = fetch_html_from_url(url)
if html_content:
    print("Fetched HTML content successfully")
else:
    print("Failed to fetch content")


Fetched HTML content successfully


In [3]:
import requests
from bs4 import BeautifulSoup

# Function to fetch HTML content with custom headers to bypass 403 errors
def fetch_html_from_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors (like 403)
        return response.text  # Return the raw HTML content
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")  # Print error if HTTP request fails
        return None
    except Exception as err:
        print(f"Other error occurred: {err}")  # Print error if something else fails
        return None

# Function to process the HTML content and extract specified tags
def extract_tags_from_html(html, tags_to_extract):
    soup = BeautifulSoup(html, 'html.parser')
    
    # Initialize a string to store the extracted content
    extracted_content = ""
    
    # Loop through each specified tag and extract its content
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            extracted_content += f"{element.get_text(strip=True)}\n\n"
    
    return extracted_content

# Function to save the extracted content as a markdown file
def save_to_markdown(content, output_file):
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Content successfully saved to {output_file}")

# Main function to process URL and extract content
def process_html(input_source, is_url=True, output_file="output.md", tags_to_extract=["article", "p", "h1"]):
    if is_url:
        # Fetch HTML from the URL
        raw_html = fetch_html_from_url(input_source)
        if raw_html:
            extracted_content = extract_tags_from_html(raw_html, tags_to_extract)
            save_to_markdown(extracted_content, output_file)
        else:
            print(f"Failed to retrieve HTML from {input_source}")
    else:
        # Read from file if not a URL
        with open(input_source, "r", encoding="utf-8") as file:
            raw_html = file.read()
            extracted_content = extract_tags_from_html(raw_html, tags_to_extract)
            save_to_markdown(extracted_content, output_file)

# Example usage
if __name__ == "__main__":
    url = "https://medium.com/nybles/inside-my-google-step-internship-fd78d1cdcf55"  # Replace with your URL
    process_html(url, is_url=True, output_file="medium_content.md", tags_to_extract=["article", "p", "h1"])


Content successfully saved to medium_content.md
