# Newsletter import

## 1.) Access Newsletter

In [1]:
import requests
from bs4 import BeautifulSoup #lol
import os
import re
import pypdfium2 as pdfium
from generate_link_lists import read_yaml_file

In [2]:
# URL of the newsletter archive
url = "https://nfdi4bioimage.de/?id=157062#c896618"

# Send a GET request to fetch the webpage
response = requests.get(url)
response.raise_for_status()  # Raise an exception for HTTP errors

# Parse the webpage using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all links that point to PDF files
pdf_links = []
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if href.endswith('.pdf'):
        # If the link is relative, make it absolute
        if not href.startswith('http'):
            href = requests.compat.urljoin(url, href)
        pdf_links.append(href)

# Print the extracted PDF links
print("Found PDF links:")
for pdf_link in pdf_links:
    print(pdf_link)

Found PDF links:
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MzgwNTc3MzUsImV4cCI6MTczODc0ODkzNSwidXNlciI6MCwiZ3JvdXBzIjpbMCwtMV0sImZpbGUiOiJmaWxlYWRtaW4vYmlvbG9naWUvYmlvaW1hZ2luZy9uZmRpNGJpb2ltYWdlL0RhdGVpZW5fenVtX0Rvd25sb2FkLzIwMjQxMjE5X05GREk0QklPSU1BR0VfTmV3c2xldHRlcl8tX0RlY2VtYmVyXzE5dGhfMjAyNC5wZGYiLCJwYWdlIjoxNTcwNjJ9.Xx04vPOS2c0JnZEC9jwnLTFAPDT5VG2WnGstwij01D8/20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MzgwNTc3MzUsImV4cCI6MTczODc0ODkzNSwidXNlciI6MCwiZ3JvdXBzIjpbMCwtMV0sImZpbGUiOiJmaWxlYWRtaW4vYmlvbG9naWUvYmlvaW1hZ2luZy9uZmRpNGJpb2ltYWdlL0RhdGVpZW5fenVtX0Rvd25sb2FkLzIwMjQwOTEwX05GREk0QklPSU1BR0VfTmV3c2xldHRlcl8tX1NlcHRlbWJlcl8xMHRoXzIwMjQucGRmIiwicGFnZSI6MTU3MDYyfQ.q1o4tqAVhWhaHSTjJa3YQL25c9n1B0hzWBPvu8lVLbU/20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1

In [3]:
# Directory to save downloaded PDFs
newsletter_dir = "newsletters"
os.makedirs(newsletter_dir, exist_ok=True)

# Download each PDF file
for pdf_link in pdf_links:
    pdf_name = pdf_link.split('/')[-1]  # Extract the file name from the URL
    pdf_path = os.path.join(newsletter_dir, pdf_name)

    # Download and save the PDF
    print(f"Downloading {pdf_name}...")
    pdf_response = requests.get(pdf_link)
    pdf_response.raise_for_status()  # Raise an exception for HTTP errors

    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)

    print(f"Saved to {pdf_path}")

print("All PDFs downloaded!")

Downloading 20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf...
Saved to newsletters/20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
Downloading 20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf...
Saved to newsletters/20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
Downloading 20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf...
Saved to newsletters/20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf
Downloading 20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf...
Saved to newsletters/20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf
Downloading 2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-03-13_NFD

In [4]:
# Directory containing the downloaded PDFs
pdf_dir = "newsletters"

# Regex pattern to match URLs
url_pattern = r"https?://[^\s<>\"']+"

# Dictionary to store URLs grouped by their respective PDF
newsletter_links = {}

# Iterate over all PDF files in the directory
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):  # Only process PDF files
        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"Extracting URLs from: {pdf_file}")

        try:
            # Open the PDF file
            pdf = pdfium.PdfDocument(pdf_path)

            # List to store links for the current PDF
            pdf_links = []

            # Loop through all pages
            for page_number in range(len(pdf)):
                page = pdf[page_number]
                text = page.get_textpage().get_text_range()  # Extract text from the page

                # Find all URLs in the text using regex
                urls = re.findall(url_pattern, text)
                pdf_links.extend(urls)  # Add found URLs to the PDF's list

            # Remove duplicates for this PDF and add to the dictionary
            newsletter_links[pdf_file] = list(set(pdf_links))

        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

# Remove duplicates across all PDFs
all_links = set()
for links in newsletter_links.values():
    all_links.update(links)

# Print all extracted URLs grouped by PDF
print("\nExtracted URLs:")
for pdf_file, links in newsletter_links.items():
    print(f"\n{pdf_file}:")
    for link in links:
        print(f"- {link}")

# Save the grouped links to a .txt file
output_file = "newsletters/newsletter_links.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w") as f:
    f.write("Extracted Newsletter Links\n")
    f.write("=" * 25 + "\n\n")
    for pdf_file, links in newsletter_links.items():
        f.write(f"Newsletter: {pdf_file}\n")
        f.write("-" * 20 + "\n")
        for link in links:
            f.write(f"- {link}\n")
        f.write("\n")

print(f"\nGrouped newsletter links saved to {output_file}.")

Extracting URLs from: 2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2022-11-04_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
Extracting URLs from: 2022-06-28_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf
Extracting URLs from: 20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
Extracting URLs from: 2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2023-03-13_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf

Extracted URLs:

2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf:
- https://doi.org/10.5281/zenodo.10008465
- https://zenodo.org/communities/nfdi4biodiv?q=&l=list&p=1&s=10&sort=newest
- https://doi.org/10.5281/zenodo.8349563
- https://doi.org/10.52825/cordi.v1i.417
- https://www.focusonmicroscopy.org/
- h



In [5]:
newsletter_links

{'2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf': ['https://doi.org/10.5281/zenodo.10008465',
  'https://zenodo.org/communities/nfdi4biodiv?q=&l=list&p=1&s=10&sort=newest',
  'https://doi.org/10.5281/zenodo.8349563',
  'https://doi.org/10.52825/cordi.v1i.417',
  'https://www.focusonmicroscopy.org/',
  'https://doi.org/10.52825/cordi.v1i.285',
  'https://doi.org/10.5281/zenodo.8323588',
  'https://www.embl.org/jobs/position/BIOHUB00006',
  'https://doi.org/10.1101/2023.05.19.541329',
  'https://www.youtube.com/playlist?list=PL2k-L-zWPoR7SHjG1HhDIwLZj0MB_stlU',
  'https://developmental.cellatlas.io/webatlas',
  'https://www.elmi2024.org/',
  'https://www.ebi.ac.uk/bioimage-archive/galleries/galleries.html',
  'https://gerbi-gmb.de/event/gerbi-annual-community-meeting\x022024/',
  'https://www.ebi.ac.uk/training/events/microscopy-data-analysis-machine-learning\x02and-bioimage-archive/',
  'https://doi.org/10.1038/d41586-023-03064-9',
  'https://github.com/NFDI4BIOIMAGE/BHG2023-OMERO-ARC',
  'ht

In [6]:
yaml_file = "../resources/nfdi4bioimage.yml"

In [7]:
# Load the URLs from the YAML file
existing_urls = set()
if os.path.exists(yaml_file):
    yaml_data = read_yaml_file(yaml_file)
    for item in yaml_data.values():
        if "url" in item:
            existing_urls.add(item["url"])

# Compare newsletter links with YAML file URLs
new_links = {pdf: [link for link in links if link not in existing_urls] for pdf, links in newsletter_links.items()}

# Save new links only to a separate file, if needed
new_links_file = "newsletters/newsletter_links.txt"
with open(new_links_file, "w") as f:
    f.write("New Links to Add\n")
    f.write("=" * 20 + "\n\n")
    for pdf_file, links in new_links.items():
        if links:
            f.write(f"Newsletter: {pdf_file}\n")
            f.write("-" * 20 + "\n")
            for link in links:
                f.write(f"- {link}\n")
            f.write("\n")

print(f"\nNew links saved to {new_links_file}.")


New links saved to newsletters/newsletter_links.txt.
