In [2]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

def download_pdfs(url, download_folder):
    # Ensure download folder exists
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    # Fetch the content of the page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links in the page
    links = soup.find_all('a')

    for link in links:
        href = link.get('href')
        if href and href.lower().endswith('.pdf'):
            full_url = urljoin(url, href)
            filename = os.path.join(download_folder, href.split('/')[-1])
            print(f"Downloading {filename}...")
            
            # Download the PDF file
            pdf_response = requests.get(full_url)
            with open(filename, 'wb') as file:
                file.write(pdf_response.content)
            print(f"Saved {filename}")

# Example usage
url = r'https://groups.io/g/Emco-CNC-Users/files?p=name%2C%2C%2C20%2C1%2C0%2C0&jump=1#'
download_folder = 'downloaded_pdfs'
download_pdfs(url, download_folder)


In [10]:
import re

def list_specific_links_from_file():
    file_path = r'C:\Users\Admin\Desktop\EMCO downloads\saved_page.html'
    
    # Read the content of the local HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Define regex for specific links, excluding those ending with '#' or query parameters
    specific_link_regex = re.compile(r'https://groups\.io/g/Emco-CNC-Users/files[^#\s\'",]*\.(pdf|doc|jpg|jpeg|pst)', re.IGNORECASE)
    
    # Find all specific links in the content
    specific_links = specific_link_regex.findall(html_content)

    print(f"Found {len(specific_links)} specific links in the file:")
    for link in specific_links:
        print(link)

# Execute the function
list_specific_links_from_file()


Found 7 specific links in the file:
pdf
pdf
pdf
pdf
pdf
pdf
doc


In [13]:
import re
import requests
from bs4 import BeautifulSoup
import time

def list_links_with_files_from_file():
    file_path = r'C:\Users\Admin\Desktop\EMCO downloads\saved_page.html'
    
    # Read the content of the local HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Define regex for specific links, excluding those ending with '#'
    specific_link_regex = re.compile(r'https://groups\.io/g/Emco-CNC-Users/files[^#\s\'"]*', re.IGNORECASE)
    
    # Find all specific links in the content
    specific_links = specific_link_regex.findall(html_content)
    
    # Define regex for any file links with extensions under the parent URL
    file_link_regex = re.compile(r'https://groups\.io/g/Emco-CNC-Users/files[^\s\'"]*\.[a-zA-Z0-9]{1,5}', re.IGNORECASE)

    print(f"Found {len(specific_links)} specific links in the file:")
    for link in specific_links:
        print(f"Checking {link} for files...")
        try:
            response = requests.get(link)
            if response.status_code == 200:
                link_soup = BeautifulSoup(response.text, 'html.parser')
                all_links = link_soup.find_all('a', href=True)
                for a_tag in all_links:
                    href = a_tag['href']
                    if href.startswith('/'):
                        href = f'https://groups.io{href}'
                    if file_link_regex.search(href):
                        print(href)
            else:
                print(f"Could not access {link} (status code: {response.status_code})")
            time.sleep(2)  # Add delay between requests
        except Exception as e:
            print(f"An error occurred while accessing {link}: {e}")

# Execute the function
list_links_with_files_from_file()


Found 156 specific links in the file:
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/files for files...
Checking https://groups.io/g/Emco-CNC-Users/filessearch for files...
Checking https://groups.io/g/Emco-CNC-Users/filessearch for files...
Checking https://groups.io/g/Emco-CNC-Users/files?p=name,,,20,2,0,0 for files...
Checking https://groups.io/g/Emco-CNC-Users/files?

KeyboardInterrupt: 

In [17]:
import re

def extract_unique_links_from_file():
    file_path = r'C:\Users\Admin\Desktop\EMCO downloads\saved_page.html'
    output_file_path = r'C:\Users\Admin\Desktop\EMCO downloads\filtered_links.txt'
    
    # Read the content of the local HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Define regex to find all links containing the base URL
    all_links_regex = re.compile(r'https://groups\.io/g/Emco-CNC-Users/files[^\s\'"#]*', re.IGNORECASE)
    
    # Find all links in the content
    all_links = all_links_regex.findall(html_content)
    
    # Define patterns to exclude
    exclude_patterns = [
        'https://groups.io/g/Emco-CNC-Users/files',
        'https://groups.io/g/Emco-CNC-Users/files#'
    ]
    
    # Filter out unwanted links and keep unique ones
    unique_links = set(link for link in all_links if link not in exclude_patterns)
    
    print(f"Found {len(unique_links)} unique links in the file:")
    
    # Save the unique links to a text file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for link in unique_links:
            print(link)  # Print the link to the console
            output_file.write(link + '\n')

# Execute the function
extract_unique_links_from_file()


Found 31 unique links in the file:
https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
https://groups.io/g/Emco-CNC-Users/files?p=name,,,20,2,0,0
https://groups.io/g/Emco-CNC-Users/files/8%20Tool%20Turett%20drawings
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20&amp;%20F1%20CNC%20Powersupply
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20Gib%20repair.doc
https://groups.io/g/Emco-CNC-Users/files?p=type,,,20,1,0,0
https://groups.io/g/Emco-CNC-Users/files/20160604090312.pdf
https://groups.io/g/Emco-CNC-Users/files?p=created,,,20,1,0,0
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20CNC_EL.pdf
https://groups.io/g/Emco-CNC-Users/files/Compact%205PC%20Stepper%20Motor%20Driver%20Board
https://groups.io/g/Emco-CNC-Users/files/6%20Position%20Toolplate
https://groups.io/g/Emco-CNC-Users/files?p=name%2C%2C%2C20%2C1%2C20%2C0&amp;jump=1
https://groups.io/g/Emco-CNC-Users/files/50%20&amp;%2055%20mill%20instruct

In [21]:
import re

def extract_filtered_links_from_file(ignore_patterns):
    file_path = r'C:\Users\Admin\Desktop\EMCO downloads\saved_page.html'
    output_file_path = r'C:\Users\Admin\Desktop\EMCO downloads\filtered_links.txt'
    
    # Read the content of the local HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Define regex to find all links containing the base URL
    all_links_regex = re.compile(r'https://groups\.io/g/Emco-CNC-Users/files[^\s\'"#]*', re.IGNORECASE)
    
    # Find all links in the content
    all_links = all_links_regex.findall(html_content)
    
    # Filter out links that contain any of the ignore patterns and keep unique ones
    unique_links = set(link for link in all_links if not any(pattern in link for pattern in ignore_patterns))
    
    print(f"Found {len(unique_links)} unique links in the file:")
    
    # Save the unique links to a text file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for link in unique_links:
            print(link)  # Print the link to the console
            output_file.write(link + '\n')

# Define patterns to ignore
ignore_patterns = [
    'search',
    'p=name',
    'p=type',
    'p=size',
    'p=created',
    'p=updated',
    '#'
]

# Execute the function with the ignore patterns
extract_filtered_links_from_file(ignore_patterns)


Found 21 unique links in the file:
https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20&amp;%20F1%20CNC%20Powersupply
https://groups.io/g/Emco-CNC-Users/files/8%20Tool%20Turett%20drawings
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20Gib%20repair.doc
https://groups.io/g/Emco-CNC-Users/files/20160604090312.pdf
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20CNC_EL.pdf
https://groups.io/g/Emco-CNC-Users/files/Compact%205PC%20Stepper%20Motor%20Driver%20Board
https://groups.io/g/Emco-CNC-Users/files/6%20Position%20Toolplate
https://groups.io/g/Emco-CNC-Users/files/50%20&amp;%2055%20mill%20instructions,%20guides,%20and%20manuals
https://groups.io/g/Emco-CNC-Users/files
https://groups.io/g/Emco-CNC-Users/files/Comp%205%20PC%20Interface%20Board%20Mod
https://groups.io/g/Emco-CNC-Users/files/Compact%205%20PC%20Info
https://groups.io/g/Emco-CNC-Users/files/Com

In [22]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Load filtered links
with open('filtered_links.txt', 'r') as file:
    links = file.read().splitlines()

# Create directory to save files
download_dir = 'downloaded_files'
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

# Function to download a file
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check if the request was successful
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}. Error: {e}")

# Function to extract files from a URL
def extract_files_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links in the page
        links = [a['href'] for a in soup.find_all('a', href=True)]
        
        # Filter and download files
        file_urls = []
        for link in links:
            if re.search(r'\.[a-zA-Z0-9]+$', link):
                file_url = urljoin(url, link)
                file_urls.append(file_url)
        
        return file_urls
    except Exception as e:
        print(f"Failed to extract files from {url}. Error: {e}")
        return []

# Main process
for link in links:
    print(f"Processing link: {link}")
    files = extract_files_from_url(link)
    
    if not files:  # If no files found, try to check the link itself
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            files = [urljoin(link, l) for l in links if re.search(r'\.[a-zA-Z0-9]+$', l)]
        except Exception as e:
            print(f"Failed to process link {link}. Error: {e}")

    # Download files
    for file_url in files:
        filename = os.path.basename(urlparse(file_url).path)
        save_path = os.path.join(download_dir, filename)
        download_file(file_url, save_path)


Processing link: https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Failed to download https://groups.io. Error: [Errno 2] No such file or directory: 'downloaded_files\\'
Downloaded: https://groups.io/facebookstartauth?r=https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Downloaded: https://groups.io/googlestartauth?r=https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Processing link: https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Failed to download https://groups.io. Error: [Errno 2] No such file or directory: 'downloaded_files\\'
Downloaded: https://groups.io/facebookstartauth?r=https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Downloaded: https://groups.io/googlestartauth?r=https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Processing link: https://groups.io/g/Emco-CNC-Users/files/Compact%205%20&amp;%20F1%20CNC%20Powersupply
Failed to download https://groups.io. Error: [Errno 2] No such file or directory: 'downloaded_files\\'
Proc

In [23]:
import re
import os

# Define the path to the saved HTML file
file_path = r'C:\Users\Admin\Desktop\EMCO downloads\saved_page.html'

# Define the output path for filtered links
filtered_links_path = 'filtered_links.txt'

# Define the base URL to match
base_url = 'https://groups.io/g/Emco-CNC-Users/files'

# Define strings to ignore
ignore_strings = ['#', 'search']

# Read and process the HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Extract all links from the HTML content
links = re.findall(r'href=["\'](https://groups.io/g/Emco-CNC-Users/files[^"\']*)["\']', html_content)

# Filter out unwanted links
filtered_links = set()  # Use a set to avoid duplicate links

for link in links:
    if not any(ignored in link for ignored in ignore_strings):
        filtered_links.add(link)

# Save filtered links to a file
with open(filtered_links_path, 'w') as file:
    for link in filtered_links:
        file.write(link + '\n')

print(f"Filtered links have been saved to {filtered_links_path}")


Filtered links have been saved to filtered_links.txt


In [24]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Load filtered links
with open('filtered_links.txt', 'r') as file:
    links = file.read().splitlines()

# Create directory to save files
download_dir = 'downloaded_files'
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

# Function to download a file
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check if the request was successful
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}. Error: {e}")

# Function to extract files from a URL
def extract_files_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links in the page
        page_links = [a['href'] for a in soup.find_all('a', href=True)]
        
        # Filter and return file URLs
        file_urls = []
        for link in page_links:
            # Ensure the link has a file extension and is valid
            if re.search(r'\.[a-zA-Z0-9]+$', link):
                file_url = urljoin(url, link)
                file_urls.append(file_url)
        
        return file_urls
    except Exception as e:
        print(f"Failed to extract files from {url}. Error: {e}")
        return []

# Main process
for link in links:
    print(f"Processing link: {link}")
    files = extract_files_from_url(link)
    
    if not files:  # If no files found, check if the link itself has files
        try:
            # Extract file URLs directly from the current link
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            page_links = [a['href'] for a in soup.find_all('a', href=True)]
            files = [urljoin(link, l) for l in page_links if re.search(r'\.[a-zA-Z0-9]+$', l)]
        except Exception as e:
            print(f"Failed to process link {link}. Error: {e}")

    # Download files
    for file_url in files:
        filename = os.path.basename(urlparse(file_url).path)
        if filename:  # Ensure filename is not empty
            save_path = os.path.join(download_dir, filename)
            download_file(file_url, save_path)
        else:
            print(f"Invalid file URL: {file_url}")


Processing link: https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Invalid file URL: https://groups.io
Downloaded: https://groups.io/facebookstartauth?r=https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Downloaded: https://groups.io/googlestartauth?r=https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Processing link: https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Invalid file URL: https://groups.io
Downloaded: https://groups.io/facebookstartauth?r=https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Downloaded: https://groups.io/googlestartauth?r=https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Processing link: https://groups.io/g/Emco-CNC-Users/files?p=name,,,20,2,0,0
Invalid file URL: https://groups.io
Processing link: https://groups.io/g/Emco-CNC-Users/files/8%20Tool%20Turett%20drawings
Invalid file URL: https://groups.io
Processing link: https://groups.io/g/Emco-CNC-Users/files/Compact%205%20&amp;%20F1%20CNC%20Powersupply
Invali

KeyboardInterrupt: 

In [26]:
import os
import time
import requests

# Ensure the download directory exists
if not os.path.exists('downloaded_files'):
    os.makedirs('downloaded_files')

def download_file(url):
    try:
        # Make the request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        
        # Save the file
        file_name = url.split('/')[-1]
        with open(f'downloaded_files/{file_name}', 'wb') as f:
            f.write(response.content)
        
        print(f"Downloaded: {url}")
    
    except requests.HTTPError as e:
        print(f"Failed to download {url}. HTTP Error: {e}")
    
    except Exception as e:
        print(f"An error occurred: {e}")

# List of links to download
links = [
    # Add your links here
]

for link in links:
    download_file(link)
    time.sleep(5)  # Delay between requests to avoid rate limits


In [27]:
from bs4 import BeautifulSoup

# Load the HTML file
with open('C:/Users/Admin/Desktop/EMCO downloads/saved_page.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Extract all links
links = [a['href'] for a in soup.find_all('a', href=True)]


In [28]:
print(links)

['https://groups.io/', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/g/Emco-CNC-Users/topics', 'https://groups.io/groups', 'https://groups.io/search', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/helpcenter', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/settings', 'https://groups.io/logout', 'https://groups.io/groups', 'https://groups.io/helpcenter', 'https://groups.io/search', 'https://groups.io/settings', 'https://groups.io/logout', 'https://groups.io/g/Emco-CNC-Users', 'https://groups.io/g/Emco-CNC-Users/files#submenu0', 'https://groups.io/g/Emco-CNC-Users/editsub', 'https://groups.io/g/Emco-CNC-Users//profile/@MichaelWood', 'https://groups.io/g/Emco-CNC-Users/subdeliveries', 'https://groups.io/g/Emco-CNC-Users/subintegrations', 'https://groups.io/g/Emco-CNC-Users/files#adminmenu0', 'https://groups.io/g/Emco-CNC-Users/pending', 'https://groups.io/g/Emco-CNC-Users/members', 'https://groups.io/g/Emco-CNC-Users/activity', 'https:/

In [29]:
# Remove links containing '#'
filtered_links = [link for link in links if '#' not in link]

# Now, filtered_links contains only those links that do not have '#'


In [30]:
print(links)

['https://groups.io/', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/g/Emco-CNC-Users/topics', 'https://groups.io/groups', 'https://groups.io/search', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/helpcenter', 'https://groups.io/g/Emco-CNC-Users/files#', 'https://groups.io/settings', 'https://groups.io/logout', 'https://groups.io/groups', 'https://groups.io/helpcenter', 'https://groups.io/search', 'https://groups.io/settings', 'https://groups.io/logout', 'https://groups.io/g/Emco-CNC-Users', 'https://groups.io/g/Emco-CNC-Users/files#submenu0', 'https://groups.io/g/Emco-CNC-Users/editsub', 'https://groups.io/g/Emco-CNC-Users//profile/@MichaelWood', 'https://groups.io/g/Emco-CNC-Users/subdeliveries', 'https://groups.io/g/Emco-CNC-Users/subintegrations', 'https://groups.io/g/Emco-CNC-Users/files#adminmenu0', 'https://groups.io/g/Emco-CNC-Users/pending', 'https://groups.io/g/Emco-CNC-Users/members', 'https://groups.io/g/Emco-CNC-Users/activity', 'https:/

In [34]:
# Create an empty list to store filtered links
filtered_links = []

# Loop through the original list of links
for link in links:
    if '#' not in link:
        if r'https://groups.io/g/Emco-CNC-Users/files/' in link:
            filtered_links.append(link)

# Output the filtered links
print(filtered_links)

['https://groups.io/g/Emco-CNC-Users/files/20160604090312.pdf', 'https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf', 'https://groups.io/g/Emco-CNC-Users/files/50%20&%2055%20mill%20instructions,%20guides,%20and%20manuals', 'https://groups.io/g/Emco-CNC-Users/files/5PC%20-%20Conversion%20to%20MACH3', 'https://groups.io/g/Emco-CNC-Users/files/6%20Position%20Toolplate', 'https://groups.io/g/Emco-CNC-Users/files/8%20Tool%20Turett%20drawings', 'https://groups.io/g/Emco-CNC-Users/files/ACC_MSD', 'https://groups.io/g/Emco-CNC-Users/files/Automatic%20Vise%203d%20files', 'https://groups.io/g/Emco-CNC-Users/files/C5%20PROGRAMABLE%20SPEED%20CONTROL', 'https://groups.io/g/Emco-CNC-Users/files/Comp%205%20Diag', 'https://groups.io/g/Emco-CNC-Users/files/Comp%205%20PC%20Interface%20Board%20Mod', 'https://groups.io/g/Emco-CNC-Users/files/Compact%205%20CNC_EL.pdf', 'https://groups.io/g/Emco-CNC-Users/files/Compact5CNC_EN_BedienAnl%20%28Operating%20Manual%29.pdf', 'https://groups.io/g/Emco-CNC-

In [35]:
import requests
from urllib.parse import urlparse
import os

# Define a function to download files
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {url}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False

# Define the directory to save downloaded files
download_directory = "downloads"
os.makedirs(download_directory, exist_ok=True)

# Filter links with file extensions and download
for link in filtered_links[:]:  # Iterate over a copy of the list
    parsed_url = urlparse(link)
    file_extension = os.path.splitext(parsed_url.path)[1]
    if file_extension:  # Check if there's a file extension
        save_path = os.path.join(download_directory, os.path.basename(parsed_url.path))
        if download_file(link, save_path):
            filtered_links.remove(link)  # Remove the link if downloaded successfully

print("Remaining filtered links:", filtered_links)


Downloaded: https://groups.io/g/Emco-CNC-Users/files/20160604090312.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/20160604090325.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Compact%205%20CNC_EL.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Compact5CNC_EN_BedienAnl%20%28Operating%20Manual%29.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/_Compact5_EN.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Compact5_EN.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Compact%205%20Gib%20repair.doc
Remaining filtered links: ['https://groups.io/g/Emco-CNC-Users/files/50%20&%2055%20mill%20instructions,%20guides,%20and%20manuals', 'https://groups.io/g/Emco-CNC-Users/files/5PC%20-%20Conversion%20to%20MACH3', 'https://groups.io/g/Emco-CNC-Users/files/6%20Position%20Toolplate', 'https://groups.io/g/Emco-CNC-Users/files/8%20Tool%20Turett%20drawings', 'https://groups.io/g/Emco-CNC-Users/files/ACC_MSD', 'https://groups.io/g/Emco-CNC-Users/files/

In [36]:
import requests
from urllib.parse import urlparse
import os

# Define a function to download files
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {url}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False



# Define the directory to save downloaded files
download_directory = "downloads"
os.makedirs(download_directory, exist_ok=True)

# Filter links with file extensions and download
for link in filtered_links[:]:  # Iterate over a copy of the list
    parsed_url = urlparse(link)
    file_extension = os.path.splitext(parsed_url.path)[1]
    if file_extension:  # Check if there's a file extension
        save_path = os.path.join(download_directory, os.path.basename(parsed_url.path))
        if download_file(link, save_path):
            filtered_links.remove(link)  # Remove the link if downloaded successfully

# Append remaining filtered links to a .log file
log_file_path = "filtered_links.log"
with open(log_file_path, 'a') as log_file:
    for link in filtered_links:
        log_file.write(f"{link}\n")

print(f"Remaining filtered links have been written to {log_file_path}")


Remaining filtered links have been written to filtered_links.log


In [38]:
import os
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Define a function to download files
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {url}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False

# Define a function to extract links from an HTML file
def extract_links_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a', href=True)]
    return links

# Directory to save downloaded files
download_directory = "downloads"
os.makedirs(download_directory, exist_ok=True)

# Process each HTML file in the current working directory
log_file_path = "filtered_links.log"

# Open log file for appending
with open(log_file_path, 'a') as log_file:
    # Iterate over each file in the current working directory
    for file_name in os.listdir('.'):
        if file_name.endswith('.html'):
            print(f"Processing {file_name}...")
            links = extract_links_from_html(file_name)

            # Filter and download files with extensions
            for link in links[:]:  # Iterate over a copy of the list
                if '#' in link:
                    links.remove(link)  # Remove links with #
                else:
                    parsed_url = urlparse(link)
                    file_extension = os.path.splitext(parsed_url.path)[1]
                    if file_extension:  # Check if there's a file extension
                        save_path = os.path.join(download_directory, os.path.basename(parsed_url.path))
                        if download_file(link, save_path):
                            links.remove(link)  # Remove the link if downloaded successfully

            # Append remaining filtered links to the log file
            for link in links:
                log_file.write(f"{link}\n")

print(f"Remaining filtered links have been written to {log_file_path}")


Processing Emco-CNC-Users@groups.io _ Files.html...
Downloaded: https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/pre_reglage_outillage.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Pratt%2080mm%20chuck%20backplate.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/PCTurn50_EN_Beschr_C.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/PCMill55_EN_Beschr_C.pdf
Downloaded: https://groups.io/g/Emco-CNC-Users/files/PC%20mill%2055%20backup%20EMCO%20TRANSCEND%20FROM%20ACC.zip
Downloaded: https://groups.io/g/Emco-CNC-Users/files/MSD_PLC.zip
Downloaded: https://groups.io/g/Emco-CNC-Users/files/Mk4%20lathe%20test%20TEXT.txt
Downloaded: https://groups.io/g/Emco-CNC-Users/files/MK2,%20MK4,%20F1%20GM%20codes.xls
Downloaded: https://groups.io/g/Emco-CNC-Users/files/MikesFreeInterface_2014_v1.6.4
Processing Emco-CNC-Users@groups.io _ Files1.html...
Downloaded: https://groups.io/g/Emco-CNC-Users/files/20160604090312.pdf
Do

KeyboardInterrupt: 

In [39]:
import os
import time
import requests
from bs4 import BeautifulSoup

# Define a function to download files from the links
def download_files_from_html(html_file_path):
    # Read and parse the HTML file
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Extract all links from the HTML
    links = [a['href'] for a in soup.find_all('a', href=True) if 'javascript:void(0);' not in a['href']]

    # Filter links to remove those with '#' or without file extensions
    filtered_links = [link for link in links if '#' not in link and any(link.endswith(ext) for ext in ['.pdf', '.doc', '.jpg', '.jpeg', '.pst'])]

    # Directory to save downloaded files
    download_directory = 'downloads'
    if not os.path.exists(download_directory):
        os.makedirs(download_directory)

    # Process each filtered link
    for link in filtered_links:
        attempt = 0
        success = False

        while attempt < 5 and not success:  # Retry up to 5 times
            try:
                # Download the file
                response = requests.get(link, timeout=10)
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Extract file name and save file
                file_name = os.path.join(download_directory, link.split('/')[-1])
                with open(file_name, 'wb') as file:
                    file.write(response.content)
                
                print(f"Downloaded {link}")
                success = True  # Exit the loop if download is successful

            except requests.RequestException as e:
                print(f"Failed to download {link}: {e}")
                attempt += 1
                if attempt < 5:
                    wait_time = 2 ** attempt  # Exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)  # Wait before retrying
                else:
                    print(f"Failed to download {link} after multiple attempts.")

        # Remove the link from the filtered list if download was successful
        if success:
            filtered_links.remove(link)

    # Save remaining links to a .log file
    log_file_path = 'remaining_links.log'
    with open(log_file_path, 'w') as log_file:
        for link in filtered_links:
            log_file.write(f"{link}\n")

# Loop through all HTML files in the current working directory
for file_name in os.listdir('.'):
    if file_name.endswith('.html'):
        download_files_from_html(file_name)


Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: 429 Client Error: Too Many Requests for url: https://groups.io/login?r=https%3A%2F%2Fgroups.io%2Fg%2FEmco-CNC-Users%2Ffiles%2FPS4003.pdf
Retrying in 2 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: 429 Client Error: Too Many Requests for url: https://groups.io/login?r=https%3A%2F%2Fgroups.io%2Fg%2FEmco-CNC-Users%2Ffiles%2FPS4003.pdf
Retrying in 4 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: 429 Client Error: Too Many Requests for url: https://groups.io/login?r=https%3A%2F%2Fgroups.io%2Fg%2FEmco-CNC-Users%2Ffiles%2FPS4003.pdf
Retrying in 8 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: 429 Client Error: Too Many Requests for url: https://groups.io/login?r=https%3A%2F%2Fgroups.io%2Fg%2FEmco-CNC-Users%2Ffiles%2FPS4003.pdf
Retrying in 16 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4

KeyboardInterrupt: 

In [3]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def download_file(url, folder, delay=1, max_retries=5):
    """
    Download a file from a URL with retries on failure.
    """
    filename = url.split("/")[-1]
    file_path = os.path.join(folder, filename)
    
    # Skip if file already exists
    if os.path.exists(file_path):
        print(f"File {file_path} already exists, skipping download.")
        return True
    
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {url} as {file_path}")
            return True
        except requests.RequestException as e:
            print(f"Failed to download {url}: {e}")
            retries += 1
            time.sleep(delay)
            delay *= 2  # Exponential backoff
    return False

def process_html_file(file_path, output_folder, success_log, failure_log):
    """
    Process an HTML file to extract links, download files, and log results.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    links = [a['href'] for a in soup.find_all('a', href=True)]
    
    # Filter links to keep only those with file extensions
    file_links = [urljoin(file_path, link) for link in links if re.search(r'\.\w+$', link)]

    with open(success_log, 'a') as success_file, open(failure_log, 'a') as failure_file:
        for link in file_links:
            success = download_file(link, output_folder)
            if success:
                success_file.write(link + '\n')
            else:
                failure_file.write(link + '\n')

def process_all_html_files(output_folder, success_log, failure_log):
    """
    Process all HTML files in the current working directory.
    """
    for filename in os.listdir('.'):
        if filename.endswith('.html'):
            print(f"Processing {filename}")
            process_html_file(filename, output_folder, success_log, failure_log)

# Define file paths
output_folder = 'downloads'  # Change this to your desired output folder
success_log = 'success.log'
failure_log = 'failure.log'

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process all HTML files
process_all_html_files(output_folder, success_log, failure_log)


Processing Emco-CNC-Users@groups.io _ Files.html
File downloads\PS4003.pdf already exists, skipping download.
File downloads\pre_reglage_outillage.pdf already exists, skipping download.
File downloads\Pratt%2080mm%20chuck%20backplate.pdf already exists, skipping download.
File downloads\PCTurn50_EN_Beschr_C.pdf already exists, skipping download.
File downloads\PCMill55_EN_Beschr_C.pdf already exists, skipping download.
File downloads\PC%20mill%2055%20backup%20EMCO%20TRANSCEND%20FROM%20ACC.zip already exists, skipping download.
File downloads\MSD_PLC.zip already exists, skipping download.
File downloads\Mk4%20lathe%20test%20TEXT.txt already exists, skipping download.
File downloads\MK2,%20MK4,%20F1%20GM%20codes.xls already exists, skipping download.
File downloads\MikesFreeInterface_2014_v1.6.4 already exists, skipping download.
Processing Emco-CNC-Users@groups.io _ Files1.html
File downloads\20160604090312.pdf already exists, skipping download.
File downloads\@Sharp.Shooter already exi

In [None]:
import requests
import os
import time
import re
from bs4 import BeautifulSoup

def download_file(url, output_dir, failed_log_file, initial_delay=5, delay_increment=10):
    # Check if the file already exists
    filename = url.split('/')[-1]
    file_path = os.path.join(output_dir, filename)
    if os.path.exists(file_path):
        print(f"Skipping {url} as it already exists.")
        return

    # Retry logic
    delay = initial_delay
    while True:
        try:
            print(f"Attempting to download {url}...")
            response = requests.get(url, stream=True)
            response.raise_for_status()  # Raise an exception for HTTP errors
            
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"Successfully downloaded {url} to {file_path}.")
            break

        except requests.HTTPError as e:
            if e.response.status_code == 429:  # Too Many Requests
                print(f"Failed to download {url}: {e}")
                print(f"Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
                delay += delay_increment  # Increase delay for next retry
            else:
                print(f"Failed to download {url}: {e}")
                with open(failed_log_file, 'a') as failed_file:
                    failed_file.write(f"{url}\n")
                break

        except Exception as e:
            print(f"An error occurred: {e}")
            with open(failed_log_file, 'a') as failed_file:
                failed_file.write(f"{url}\n")
            break

def process_html_file(html_file_path, output_dir, failed_log_file):
    print(f"Processing HTML file: {html_file_path}")
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    # Extract and filter links
    links = [a['href'] for a in soup.find_all('a', href=True)]
    links = [link for link in links if not any(ext in link for ext in ['.pdf', '.doc', '.jpg', '.jpeg', '.pst'])]

    print(f"Found {len(links)} links to process.")
    for link in links:
        if '#' in link:
            continue
        if re.search(r'\.(pdf|doc|jpg|jpeg|pst)$', link):
            print(f"File link found: {link}")
            download_file(link, output_dir, failed_log_file)
        else:
            with open(failed_log_file, 'a') as failed_file:
                failed_file.write(f"{link}\n")

def process_all_html_files(directory, output_dir, failed_log_file):
    print(f"Processing all HTML files in directory: {directory}")
    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            html_file_path = os.path.join(directory, filename)
            process_html_file(html_file_path, output_dir, failed_log_file)

# Directory paths
current_directory = os.getcwd()
output_directory = os.path.join(current_directory, "downloads")
failed_log_file_path = os.path.join(current_directory, "failed_log.txt")

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Process all HTML files in the current directory
process_all_html_files(current_directory, output_directory, failed_log_file_path)


In [5]:
import requests
import os
import time
import re
from bs4 import BeautifulSoup

def download_file(url, output_dir, failed_log_file, initial_delay=5, delay_increment=10):
    # Check if the file already exists
    filename = url.split('/')[-1]
    file_path = os.path.join(output_dir, filename)
    if os.path.exists(file_path):
        print(f"Skipping {url} as it already exists.")
        return

    # Retry logic
    delay = initial_delay
    while True:
        try:
            print(f"Attempting to download {url}...")
            response = requests.get(url, stream=True)
            response.raise_for_status()  # Raise an exception for HTTP errors
            
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"Successfully downloaded {url} to {file_path}.")
            break

        except requests.HTTPError as e:
            if e.response.status_code == 429:  # Too Many Requests
                print(f"Failed to download {url}: {e}")
                print(f"Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
                delay += delay_increment  # Increase delay for next retry
            else:
                print(f"Failed to download {url}: {e}")
                with open(failed_log_file, 'a') as failed_file:
                    failed_file.write(f"{url}\n")
                break

        except Exception as e:
            print(f"An error occurred: {e}")
            with open(failed_log_file, 'a') as failed_file:
                failed_file.write(f"{url}\n")
            break

def process_html_file(html_file_path, output_dir, failed_log_file):
    print(f"Processing HTML file: {html_file_path}")
    base_name = os.path.splitext(os.path.basename(html_file_path))[0]
    html_output_dir = os.path.join(output_dir, base_name)
    
    # Create subdirectory for this HTML file
    os.makedirs(html_output_dir, exist_ok=True)

    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    # Extract and filter links
    links = [a['href'] for a in soup.find_all('a', href=True)]
    links = [link for link in links if re.search(r'\.(pdf|doc|jpg|jpeg|pst)$', link)]

    print(f"Found {len(links)} links to process.")
    for link in links:
        if '#' in link:
            continue
        if re.search(r'\.(pdf|doc|jpg|jpeg|pst)$', link):
            print(f"File link found: {link}")
            download_file(link, html_output_dir, failed_log_file)
        else:
            with open(failed_log_file, 'a') as failed_file:
                failed_file.write(f"{link}\n")

def process_all_html_files(directory, output_dir, failed_log_file):
    print(f"Processing all HTML files in directory: {directory}")
    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            html_file_path = os.path.join(directory, filename)
            process_html_file(html_file_path, output_dir, failed_log_file)

# Directory paths
current_directory = os.getcwd()
output_directory = os.path.join(current_directory, "downloads")
failed_log_file_path = os.path.join(current_directory, "failed_log.txt")

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Process all HTML files in the current directory
process_all_html_files(current_directory, output_directory, failed_log_file_path)


Processing all HTML files in directory: c:\Users\Admin\Desktop\EMCO downloads
Processing HTML file: c:\Users\Admin\Desktop\EMCO downloads\Emco-CNC-Users@groups.io _ Files.html
Found 5 links to process.
File link found: https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf
Skipping https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf as it already exists.
File link found: https://groups.io/g/Emco-CNC-Users/files/pre_reglage_outillage.pdf
Skipping https://groups.io/g/Emco-CNC-Users/files/pre_reglage_outillage.pdf as it already exists.
File link found: https://groups.io/g/Emco-CNC-Users/files/Pratt%2080mm%20chuck%20backplate.pdf
Skipping https://groups.io/g/Emco-CNC-Users/files/Pratt%2080mm%20chuck%20backplate.pdf as it already exists.
File link found: https://groups.io/g/Emco-CNC-Users/files/PCTurn50_EN_Beschr_C.pdf
Skipping https://groups.io/g/Emco-CNC-Users/files/PCTurn50_EN_Beschr_C.pdf as it already exists.
File link found: https://groups.io/g/Emco-CNC-Users/files/PCMill55_EN_Beschr_C.p

In [7]:
import os
import re
import requests
from bs4 import BeautifulSoup
from time import sleep
import random

def extract_links_from_html(html_file):
    """Extract all links with file extensions from the given HTML file."""
    links = []
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if re.search(r'\.\w+$', href):  # Filter by file extensions
                links.append(href)
    return links

def download_file(url, file_path, failed_log_file):
    """Attempt to download a file, handle errors, and log failures."""
    delay = 1  # Start with a base delay in seconds
    max_delay = 3600  # Maximum delay in seconds

    while True:
        try:
            if os.path.exists(file_path):
                print(f"File already exists: {file_path}")
                break

            response = requests.get(url, stream=True, allow_redirects=True)
            response.raise_for_status()  # Check for HTTP errors

            # Check if the content type is what we expect (optional)
            content_type = response.headers.get('Content-Type')
            if 'application/pdf' not in content_type and 'image/' not in content_type:
                raise ValueError(f"Unexpected Content-Type: {content_type}")

            # Write file
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            # Check if the downloaded file is empty or not
            if os.path.getsize(file_path) < 1024:  # Example threshold of 1KB
                raise ValueError("File is too small, likely incomplete.")

            print(f"Successfully downloaded: {url}")
            break

        except (requests.RequestException, ValueError) as e:
            print(f"Failed to download {url}: {e}")
            with open(failed_log_file, 'a', encoding='utf-8') as log_file:
                log_file.write(f"{url}\n")

            # Increase delay after each failure
            delay = min(delay * 2, max_delay)
            print(f"Retrying in {delay} seconds...")
            sleep(delay)

def process_html_files(directory, failed_log_file):
    """Process each HTML file in the directory to extract and download files."""
    html_files = [f for f in os.listdir(directory) if f.endswith('.html')]

    for html_file in html_files:
        print(f"Processing file: {html_file}")
        links = extract_links_from_html(os.path.join(directory, html_file))
        
        for link in links:
            # Construct full URL if necessary
            if not link.startswith('http'):
                link = f"https://groups.io{link}"
            
            # Construct file path based on HTML file name
            file_name = link.split('/')[-1]
            file_path = os.path.join(directory, html_file.replace('.html', ''), file_name)
            
            # Ensure the directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            
            # Download the file
            download_file(link, file_path, failed_log_file)

def filter_failed_log(failed_log_file):
    """Filter out links that don't start with the specified prefix."""
    prefix = "https://groups.io/g/Emco-CNC-Users/files"

    # Read the failed log file and filter out invalid links
    with open(failed_log_file, 'r', encoding='utf-8') as file:
        links = file.readlines()

    # Filter links that do not start with the specified prefix
    filtered_links = [link for link in links if link.strip().startswith(prefix)]

    # Write the filtered links back to the file
    with open(failed_log_file, 'w', encoding='utf-8') as file:
        file.writelines(filtered_links)

    print(f"Filtered failed links. Remaining links are written back to {failed_log_file}")

# Main script
current_directory = os.getcwd()
failed_log_file_path = 'failed_log.txt'

# Process HTML files to download files
process_html_files(current_directory, failed_log_file_path)

# Filter the failed log file
filter_failed_log(failed_log_file_path)


Processing file: Emco-CNC-Users@groups.io _ Files.html
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 2 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 4 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 8 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 16 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 32 seconds...
Failed to download https://groups.io/g/Emco-CNC-Users/files/PS4003.pdf: Unexpected Content-Type: text/html; charset=utf-8
Retrying in 64 seconds...


KeyboardInterrupt: 

In [1]:
import os
from bs4 import BeautifulSoup
import re

def extract_links_from_html(html_file):
    """Extract all links with file extensions from the given HTML file."""
    links = []
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if re.search(r'\.\w+$', href):  # Filter by file extensions
                links.append(href)
    return links

def process_html_files(directory, output_file):
    """Process each HTML file in the directory to extract and save links."""
    html_files = [f for f in os.listdir(directory) if f.endswith('.html')]
    all_links = []

    for html_file in html_files:
        print(f"Processing file: {html_file}")
        links = extract_links_from_html(os.path.join(directory, html_file))
        all_links.extend(links)

    # Save all links to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"All links have been saved to {output_file}")

# Main script
current_directory = os.getcwd()
output_file_path = 'all_links.txt'

# Process HTML files to extract and save links
process_html_files(current_directory, output_file_path)


Processing file: Emco-CNC-Users@groups.io _ Files.html
Processing file: Emco-CNC-Users@groups.io _ Files2.html
Processing file: Emco-CNC-Users@groups.io _ Files3.html
Processing file: Emco-CNC-Users@groups.io _ Files4.html
Processing file: Emco-CNC-Users@groups.io _ Files5.html
Processing file: index.html
Processing file: saved_page.html
All links have been saved to all_links.txt


In [4]:
import os

def find_all_files_in_dir(base_dir):
    """Recursively find all files in the given directory and its subdirectories."""
    file_paths = set()
    for root, _, files in os.walk(base_dir):
        for file in files:
            file_paths.add(os.path.join(root, file))
    return file_paths

def remove_existing_links(links_file, base_dir):
    """Remove links from the file if corresponding files are found in the directory and its subdirectories."""
    # Find all existing files in the base directory and its subfolders
    all_files = find_all_files_in_dir(base_dir)
    existing_files = {os.path.basename(file) for file in all_files}

    # Read links from the links file
    with open(links_file, 'r') as f:
        links = f.readlines()

    # Check each link and write out those which have no existing file
    remaining_links = []
    for link in links:
        link = link.strip()
        file_name = os.path.basename(link)
        if file_name not in existing_files:
            remaining_links.append(link)
        else:
            print(f"File found for link: {link}, skipping.")

    # Write remaining links to the file
    with open(links_file, 'w') as f:
        for link in remaining_links:
            f.write(link + '\n')
    
    print(f"Processed links file: {links_file}")
    print(f"Remaining links: {len(remaining_links)}")

# Path to the directory to search for files
base_directory = '.'  # You can specify a different directory if needed

# Path to the links file
links_file_path = 'all_links.txt'

# Remove existing links from the links file
remove_existing_links(links_file_path, base_directory)


Processed links file: all_links.txt
Remaining links: 51
