In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Function to sanitize filenames
def sanitize_filename(filename):
    """Turn a URL-derived name into a filename that is safe to create on disk.

    Args:
        filename: Candidate name, typically the last path segment of a URL.
            It may still carry a query string (``?v=1``) or fragment (``#x``).

    Returns:
        The name with the query string and fragment removed, characters that
        are illegal in filenames (``<>:"/\\|?*`` and ASCII control characters)
        replaced by ``_``, and trailing dots/spaces trimmed. If nothing usable
        is left, the placeholder ``'unnamed'`` is returned so the result never
        resolves to the containing directory itself.
    """
    # The query string and fragment are not part of the resource's name.
    name = filename.split('?', 1)[0].split('#', 1)[0]
    # Replace characters that are reserved on common file systems, plus
    # control characters, which are never valid in a Windows filename.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    # Trailing dots/spaces are rejected on Windows; this also collapses
    # the special names '.' and '..' to the empty string.
    name = name.rstrip(' .')
    # An empty name would make os.path.join() return the directory path.
    return name or 'unnamed'

def save_file_from_url(url, directory, filename, timeout=30):
    """Download ``url`` and write the raw response body into ``directory``.

    Args:
        url: Absolute URL of the resource to fetch.
        directory: Folder that receives the file; created if it is missing.
        filename: Desired file name; it is passed through
            ``sanitize_filename`` before being used on disk.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Raises:
        requests.RequestException: If the request fails or times out, or
            (via ``raise_for_status``) the server answers with an error status.
        OSError: If the file cannot be written.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Sanitize the filename to avoid file system errors
    safe_filename = sanitize_filename(filename)
    # An empty directory means "current directory"; makedirs('') would fail.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, safe_filename), 'wb') as file:
        file.write(response.content)




# Create a directory for the scraped content.
# exist_ok=True avoids the check-then-create race of exists() + makedirs().
directory_name = 'website_content'
os.makedirs(directory_name, exist_ok=True)

# URL of the webpage to scrape
url = 'https://www.nyclu.org/en/nypd-traffic-stops-data'

# Seconds to wait for the page before giving up; without a timeout the
# request can block the notebook indefinitely.
request_timeout = 30

# Send a GET request to the webpage
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()

# Parse the HTML content of the page using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Save the main HTML content
with open(os.path.join(directory_name, 'index.html'), 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

# Linked assets to download, CSS first and then JS:
# (tag name, attribute filter for find_all, attribute that holds the URL)
asset_specs = (
    ("link", {"rel": "stylesheet"}, "href"),
    ("script", {"src": True}, "src"),
)

for tag_name, attr_filter, url_attr in asset_specs:
    for tag in soup.find_all(tag_name, attrs=attr_filter):
        asset_ref = tag.get(url_attr)
        if not asset_ref:
            continue
        asset_url = urljoin(url, asset_ref)
        # Drop the query string/fragment *before* taking the basename, so a
        # '/' inside the query cannot be mistaken for a path separator.
        asset_path = asset_ref.split('?', 1)[0].split('#', 1)[0]
        asset_filename = os.path.basename(asset_path)
        try:
            save_file_from_url(asset_url, directory_name, asset_filename)
        except (requests.RequestException, OSError) as error:
            # One broken or unsupported asset (404, data: URI, unwritable
            # name) should not abort the rest of the download.
            print(f"Skipped {asset_url}: {error}")

print(f"Content saved in {directory_name} directory")


Content saved in website_content directory


In [None]:
# PowerShell script: give the downloaded assets predictable names
# (style1.css, style2.css, ... / script1.js, script2.js, ...) and update the
# matching file names inside index.html. Run it from the directory that
# contains index.html and the downloaded .css/.js files.
function Rename-SiteAsset {
    param(
        [string]$Pattern,    # wildcard path selecting the files, e.g. '.\*.css'
        [string]$Prefix,     # base of the new name, e.g. 'style'
        [string]$Extension   # extension of the new name including the dot, e.g. '.css'
    )

    # Read the page once, explicitly as UTF-8, so non-ASCII characters are not
    # re-encoded with the session's default code page.
    $html = Get-Content -LiteralPath index.html -Encoding UTF8

    $counter = 1
    # @(...) collects the complete listing before anything is renamed, so a
    # file that was just renamed cannot show up again in the enumeration.
    foreach ($file in @(Get-ChildItem -Path $Pattern)) {
        $originalName = $file.Name
        $newName = "$Prefix$counter$Extension"
        # -LiteralPath prevents characters such as '[' in a file name from
        # being interpreted as wildcards.
        Rename-Item -LiteralPath $file.FullName -NewName $newName
        # -replace takes a regular expression; escape the name so '.', '+',
        # '(' and similar characters match literally.
        $html = $html -replace [regex]::Escape($originalName), $newName
        $counter++
    }

    # Write the page back once, again as UTF-8.
    Set-Content -LiteralPath index.html -Value $html -Encoding UTF8
}

Rename-SiteAsset -Pattern '.\*.css' -Prefix 'style' -Extension '.css'
Rename-SiteAsset -Pattern '.\*.js' -Prefix 'script' -Extension '.js'
