In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# List of URLs to download
urls = [
    "https://www.gernot-schaly.de/40882.html",
    "https://www.gernot-schaly.de/40564.html",
    "https://www.gernot-schaly.de/252001.html",
    "https://www.gernot-schaly.de/41247.html",
    "https://www.gernot-schaly.de/39994.html",
    "https://www.gernot-schaly.de/40555.html",
    "https://www.gernot-schaly.de/40903.html",
    "https://www.gernot-schaly.de/40962.html",
    "https://www.gernot-schaly.de/137901.html",
    "https://www.gernot-schaly.de/42217.html",
    "https://www.gernot-schaly.de/300267.html",
    "https://www.gernot-schaly.de/40864/40863.html",
    "https://www.gernot-schaly.de/40864/home.html",
]

# Function to create necessary folders and download the content
def download_page_content(url):
    # Check if the URL has the jump parameter
    if "jump=" in url:
        # Extract jump value to create corresponding folder (e.g., jump0, jump1, etc.)
        jump_value = url.split("jump=")[1].split("&")[0]
        folder_name = f"jump{jump_value}"
    else:
        # For URLs without the jump parameter, handle them based on specific logic
        if "/40864/40863.html" in url:
            folder_name = "40863"
        elif "/40864/home.html" in url:
            folder_name = "home"
        else:
            folder_name = url.split("/")[-1].replace(".html", "").replace("/home", "")

    # Create necessary folder structure
    base_folder = os.path.join("website_backup", folder_name)
    content_folder = os.path.join(base_folder, "page_content")
    frontend_images_folder = os.path.join(base_folder, "frontend_images")
    high_resolution_images_folder = os.path.join(base_folder, "high_resolution_images")

    # Create directories
    os.makedirs(content_folder, exist_ok=True)
    os.makedirs(frontend_images_folder, exist_ok=True)
    os.makedirs(high_resolution_images_folder, exist_ok=True)

    # Fetch the HTML content of the page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Save the HTML content
    html_path = os.path.join(content_folder, "content.html")
    with open(html_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
    print(f"Saved HTML content for {folder_name}")

    # Function to download images
    def download_image(img_url, folder):
        img_data = requests.get(img_url).content
        img_name = img_url.split("/")[-1]
        with open(os.path.join(folder, img_name), 'wb') as img_file:
            img_file.write(img_data)
        print(f"Downloaded image: {img_name} to {folder}")

    # Download front-end images
    images = soup.find_all("img")
    for img in images:
        img_src = img.get("src")
        if img_src:
            img_url = urljoin(url, img_src)
            download_image(img_url, frontend_images_folder)

    # Download high-resolution images
    for link in soup.find_all("a", href=True):
        img_url = link["href"]
        if img_url.endswith((".jpg", ".jpeg", ".png", ".gif")):
            full_img_url = urljoin(url, img_url)
            download_image(full_img_url, high_resolution_images_folder)

    print(f"Downloaded all content for {folder_name}")

# Iterate over each URL and download its content
for url in urls:
    download_page_content(url)

print("All pages downloaded and organized!")


WebDriverException: Message: Service /home/schaly/.wdm/drivers/chromedriver/linux64/114.0.5735.90/chromedriver unexpectedly exited. Status code was: 127


In [47]:
import os

# Create folder structure for jump0 to jump12
for i in range(13):
    # Define the path for each folder
    folder_path = f"jump{i}/page_content"
    
    # Create the folder structure if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    
    # Define the path for the HTML file
    file_path = os.path.join(folder_path, "content.html")
    
    # Create an empty HTML file (you can modify this part if you want to add content)
    with open(file_path, "w") as file:
        file.write("<html><body><h1>Placeholder content</h1></body></html>")

print("Folder structure and files created successfully.")


Folder structure and files created successfully.


In [29]:
import os
from bs4 import BeautifulSoup

# Base folder containing the downloaded content
base_download_folder = "website_backup"

def update_image_paths(folder_name):
    # Paths for each folder
    content_folder = os.path.join(base_download_folder, folder_name, "page_content")
    frontend_images_folder = os.path.join("..", "frontend_images")  # Relative path to frontend_images folder
    html_path = os.path.join(content_folder, "content.html")

    # Check if HTML file exists
    if not os.path.exists(html_path):
        print(f"HTML content for {folder_name} not found, skipping...")
        return
    
    # Load HTML content
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Update image paths
    images = soup.find_all("img")
    for img in images:
        img_src = img.get("src")
        if img_src:
            # Remove query parameters from image src
            img_src_cleaned = img_src.split("?")[0]  # Remove anything after "?"
            
            # Extract image name and update the path to point to the local frontend_images folder
            img_name = os.path.basename(img_src_cleaned)  # Get the image file name without path
            new_img_path = os.path.join(frontend_images_folder, img_name)  # Use local relative path
            img["src"] = new_img_path
            
            print(f"Updated image source for {img_name} to {new_img_path}")

    # Save updated HTML
    with open(html_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
    print(f"Updated HTML content for {folder_name}")

# Loop through each folder in the base download directory
for folder_name in os.listdir(base_download_folder):
    update_image_paths(folder_name)

print("All image paths updated in HTML files!")

Updated image source for Slice_01.gif to ../frontend_images/Slice_01.gif
Updated image source for Slice_02.gif to ../frontend_images/Slice_02.gif
Updated image source for logo.gif to ../frontend_images/logo.gif
Updated image source for Slice_04.gif to ../frontend_images/Slice_04.gif
Updated image source for kv_9924.jpg to ../frontend_images/kv_9924.jpg
Updated image source for Slice_06.gif to ../frontend_images/Slice_06.gif
Updated image source for company_name.gif to ../frontend_images/company_name.gif
Updated image source for Slice_08.gif to ../frontend_images/Slice_08.gif
Updated image source for Slice_09.gif to ../frontend_images/Slice_09.gif
Updated image source for spacer.gif to ../frontend_images/spacer.gif
Updated image source for 40882_n.gif to ../frontend_images/40882_n.gif
Updated image source for 40564_n.gif to ../frontend_images/40564_n.gif
Updated image source for 252001_n.gif to ../frontend_images/252001_n.gif
Updated image source for 41247_n.gif to ../frontend_images/41

In [30]:
import os
import re

# Base directory containing each folder with images
base_download_folder = "website_backup"

# Regex pattern to match files with query parameters
query_param_pattern = re.compile(r"(.*?)(\?.*)$")

def clean_image_filenames(folder_path):
    for filename in os.listdir(folder_path):
        # Check if the filename matches the pattern with a query parameter
        if query_param_pattern.match(filename):
            # Extract base filename without the query parameter
            clean_filename = query_param_pattern.sub(r"\1", filename)
            
            # Full paths for old and new filenames
            old_file_path = os.path.join(folder_path, filename)
            new_file_path = os.path.join(folder_path, clean_filename)
            
            # Rename file
            os.rename(old_file_path, new_file_path)
            print(f"Renamed '{filename}' to '{clean_filename}'")
        else:
            print(f"No query parameter found in '{filename}', skipping...")

# Loop through each folder in website_backup to find frontend_images
for folder_name in os.listdir(base_download_folder):
    frontend_images_folder = os.path.join(base_download_folder, folder_name, "frontend_images")
    
    # Check if frontend_images folder exists
    if os.path.isdir(frontend_images_folder):
        print(f"Processing images in '{frontend_images_folder}'...")
        clean_image_filenames(frontend_images_folder)
    else:
        print(f"No frontend_images folder found in '{folder_name}', skipping...")

print("All files renamed!")

Processing images in 'website_backup/40903/frontend_images'...
No query parameter found in 'b977b37e8ea8b3f1ffff80b6ffffffef.jpg', skipping...
No query parameter found in 'c5d1f0551ef3fa97ffff8009fffffff1.jpg', skipping...
No query parameter found in 'b852edbe6e53a221ffff814cfffffff2.jpg', skipping...
No query parameter found in '4521f7e0ad5d280fffff820efffffff1.JPG', skipping...
No query parameter found in 'Slice_15.gif', skipping...
No query parameter found in '4f6016af8f472992ffff8161ffffffef.JPG', skipping...
No query parameter found in '81a4932e27b51ee9ffff80bdfffffff0.JPG', skipping...
No query parameter found in '300267_n.gif', skipping...
No query parameter found in 'e40f978cf7997ceffff8051fffffff0.jpg', skipping...
No query parameter found in '81a4932e27b51ee9ffff80befffffff0.JPG', skipping...
No query parameter found in '47fdb2ecaf3f11eeffff8049ffffffef.jpg', skipping...
No query parameter found in '7196839c8ae6e6f4ffff80bcac144227.JPG', skipping...
No query parameter found i

In [31]:
import os
from bs4 import BeautifulSoup

# Base folder containing the downloaded content
base_download_folder = "website_backup"

def update_href_paths(folder_name):    
    # Paths for each folder
    content_folder = os.path.join(base_download_folder, folder_name, "page_content")
    html_path = os.path.join(content_folder, "content.html")
    
    # Check if HTML file exists
    if not os.path.exists(html_path):
        print(f"HTML content for {folder_name} not found, skipping...")
        return
    
    # Load HTML content
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Update href references
    links = soup.find_all("a", href=True)
    for link in links:
        old_href = link["href"]
        # Check if href starts with './' and replace it with the new format
        if old_href.startswith("./"):
            # Extract the number from the old format (e.g., "307667.html" becomes "307667")
            page_number = old_href[2:-5]  # Removing './' and '.html'
            new_href = f"../../{page_number}/page_content/content.html"
            link["href"] = new_href
            print(f"Updated href from '{old_href}' to '{new_href}'")

    # Save updated HTML
    with open(html_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
    print(f"Updated HTML content for {folder_name}")

# Loop through each folder in the base download directory
for folder_name in os.listdir(base_download_folder):
    update_href_paths(folder_name)

print("All href paths updated in HTML files!")


Updated HTML content for 40903
Updated HTML content for home
Updated HTML content for 300267
Updated HTML content for 42217
Updated HTML content for 307667
Updated HTML content for 40564
Updated HTML content for 40962
Updated HTML content for 41247
Updated HTML content for 40882
Updated href from './../40882.html' to '../../../40882/page_content/content.html'
Updated href from './../40564.html' to '../../../40564/page_content/content.html'
Updated href from './../252001.html' to '../../../252001/page_content/content.html'
Updated href from './../41247.html' to '../../../41247/page_content/content.html'
Updated href from './../39994.html' to '../../../39994/page_content/content.html'
Updated href from './../40555.html' to '../../../40555/page_content/content.html'
Updated href from './../40903.html' to '../../../40903/page_content/content.html'
Updated href from './../40962.html' to '../../../40962/page_content/content.html'
Updated href from './../137901.html' to '../../../137901/page_

In [32]:
import os
from bs4 import BeautifulSoup

# Base folder containing the downloaded content
base_download_folder = "website_backup"

def update_href_paths(folder_name):
    # Paths for each folder
    content_folder = os.path.join(base_download_folder, folder_name, "page_content")
    html_path = os.path.join(content_folder, "content.html")
    
    # Check if HTML file exists
    if not os.path.exists(html_path):
        print(f"HTML content for {folder_name} not found, skipping...")
        return
    
    # Load HTML content
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Update href references
    links = soup.find_all("a", href=True)
    for link in links:
        old_href = link["href"]
        # Replace ../../../ with ../../
        if "../../../" in old_href:
            new_href = old_href.replace("../../../", "../../")
            link["href"] = new_href
            print(f"Updated href from '{old_href}' to '{new_href}'")

    # Save updated HTML
    with open(html_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
    print(f"Updated HTML content for {folder_name}")

# Loop through each folder in the base download directory
for folder_name in os.listdir(base_download_folder):
    update_href_paths(folder_name)

print("All href paths updated in HTML files!")


Updated HTML content for 40903
Updated HTML content for home
Updated HTML content for 300267
Updated HTML content for 42217
Updated HTML content for 307667
Updated HTML content for 40564
Updated HTML content for 40962
Updated HTML content for 41247
Updated HTML content for 40882
Updated href from '../../../40882/page_content/content.html' to '../../40882/page_content/content.html'
Updated href from '../../../40564/page_content/content.html' to '../../40564/page_content/content.html'
Updated href from '../../../252001/page_content/content.html' to '../../252001/page_content/content.html'
Updated href from '../../../41247/page_content/content.html' to '../../41247/page_content/content.html'
Updated href from '../../../39994/page_content/content.html' to '../../39994/page_content/content.html'
Updated href from '../../../40555/page_content/content.html' to '../../40555/page_content/content.html'
Updated href from '../../../40903/page_content/content.html' to '../../40903/page_content/cont

In [33]:
import os
from bs4 import BeautifulSoup

# Base folder containing the downloaded content
base_download_folder = "website_backup"

def update_href_paths(folder_name):
    # Paths for each folder
    content_folder = os.path.join(base_download_folder, folder_name, "page_content")
    html_path = os.path.join(content_folder, "content.html")
    
    # Check if HTML file exists
    if not os.path.exists(html_path):
        print(f"HTML content for {folder_name} not found, skipping...")
        return
    
    # Load HTML content
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Update href references
    links = soup.find_all("a", href=True)
    for link in links:
        old_href = link["href"]
        
        # Replace ../../40864/home/page_content/content.html with ../../home/page_content/content.html
        if "../../40864/home/page_content/content.html" in old_href:
            new_href = old_href.replace("../../40864/home/page_content/content.html", "../../home/page_content/content.html")
            link["href"] = new_href
            print(f"Updated href from '{old_href}' to '{new_href}'")
        
        # Replace ../../40864/40863/page_content/content.html with ../../40863/page_content/content.html
        if "../../40864/40863/page_content/content.html" in old_href:
            new_href = old_href.replace("../../40864/40863/page_content/content.html", "../../40863/page_content/content.html")
            link["href"] = new_href
            print(f"Updated href from '{old_href}' to '{new_href}'")

    # Save updated HTML
    with open(html_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
    print(f"Updated HTML content for {folder_name}")

# Loop through each folder in the base download directory
for folder_name in os.listdir(base_download_folder):
    update_href_paths(folder_name)

print("All href paths updated in HTML files!")

Updated HTML content for 40903
Updated HTML content for home
Updated HTML content for 300267
Updated HTML content for 42217
Updated HTML content for 307667
Updated HTML content for 40564
Updated HTML content for 40962
Updated HTML content for 41247
Updated HTML content for 40882
Updated href from '../../40864/home/page_content/content.html' to '../../home/page_content/content.html'
Updated HTML content for 40863
Updated HTML content for 252001
Updated HTML content for 40555
Updated HTML content for 137901
Updated HTML content for 39994
All href paths updated in HTML files!
