# Newsletter import

## 1.) Access Newsletter

In [1]:
import requests
from bs4 import BeautifulSoup #lol
import os
import re
import pypdfium2 as pdfium
from generate_link_lists import read_yaml_file

In [2]:
# URL of the newsletter archive
url = "https://nfdi4bioimage.de/?id=157062#c896618"

# Send a GET request to fetch the webpage
response = requests.get(url)
response.raise_for_status()  # Raise an exception for HTTP errors

# Parse the webpage using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all links that point to PDF files
pdf_links = []
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if href.endswith('.pdf'):
        # If the link is relative, make it absolute
        if not href.startswith('http'):
            href = requests.compat.urljoin(url, href)
        pdf_links.append(href)

# Print the extracted PDF links
print("Found PDF links:")
for pdf_link in pdf_links:
    print(pdf_link)

Found PDF links:
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3Mzc0NjUzNDUsImV4cCI6MTczODE1NjU0NSwidXNlciI6MCwiZ3JvdXBzIjpbMCwtMV0sImZpbGUiOiJmaWxlYWRtaW4vYmlvbG9naWUvYmlvaW1hZ2luZy9uZmRpNGJpb2ltYWdlL0RhdGVpZW5fenVtX0Rvd25sb2FkLzIwMjQxMjE5X05GREk0QklPSU1BR0VfTmV3c2xldHRlcl8tX0RlY2VtYmVyXzE5dGhfMjAyNC5wZGYiLCJwYWdlIjoxNTcwNjJ9.qg6ifNSs4MHkVMucwZKqc5Z_z7SkNjTvj0GBcoeiR2I/20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3Mzc0NjUzNDUsImV4cCI6MTczODE1NjU0NSwidXNlciI6MCwiZ3JvdXBzIjpbMCwtMV0sImZpbGUiOiJmaWxlYWRtaW4vYmlvbG9naWUvYmlvaW1hZ2luZy9uZmRpNGJpb2ltYWdlL0RhdGVpZW5fenVtX0Rvd25sb2FkLzIwMjQwOTEwX05GREk0QklPSU1BR0VfTmV3c2xldHRlcl8tX1NlcHRlbWJlcl8xMHRoXzIwMjQucGRmIiwicGFnZSI6MTU3MDYyfQ.IGdH3815D5lHxHBiTo1BjN78ia1CC_7RI73RA29WwPQ/20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
https://nfdi4bioimage.de/securedl/sdl-eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1

In [3]:
# Directory to save downloaded PDFs
output_dir = "newsletters"
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Download each PDF file
for pdf_link in pdf_links:
    pdf_name = pdf_link.split('/')[-1]  # Extract the file name from the URL
    pdf_path = os.path.join(output_dir, pdf_name)

    # Download and save the PDF
    print(f"Downloading {pdf_name}...")
    pdf_response = requests.get(pdf_link)
    pdf_response.raise_for_status()  # Raise an exception for HTTP errors

    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)

    print(f"Saved to {pdf_path}")

print("All PDFs downloaded!")

Downloading 20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf...
Saved to newsletters/20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
Downloading 20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf...
Saved to newsletters/20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
Downloading 20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf...
Saved to newsletters/20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf
Downloading 20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf...
Saved to newsletters/20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf
Downloading 2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf...
Saved to newsletters/2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf
Downloading 2023-03-13_NFD

In [4]:
# Directory containing the downloaded PDFs
pdf_dir = "newsletters"

# Regex pattern to match URLs
url_pattern = r"https?://[^\s<>\"']+"

# Output: a single list to store all URLs
newsletter_links = []

# Iterate over all PDF files in the directory
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):  # Only process PDF files
        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"Extracting URLs from: {pdf_file}")

        try:
            # Open the PDF file
            pdf = pdfium.PdfDocument(pdf_path)

            # Loop through all pages
            for page_number in range(len(pdf)):
                page = pdf[page_number]
                text = page.get_textpage().get_text_range()  # Extract text from the page

                # Find all URLs in the text using regex
                urls = re.findall(url_pattern, text)
                newsletter_links.extend(urls)  # Add found URLs to the global list

        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

# Remove duplicates from the global list
newsletter_links = list(set(newsletter_links))

# Print all extracted URLs in a structured way
print("\nExtracted URLs:")
for url in newsletter_links:
    print(f"- {url}")

# If you want to check the variable explicitly:
print("\nVariable `newsletter_links` contains all URLs:")

Extracting URLs from: 2023-12-20_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2022-11-04_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2023-09-08_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20241219_NFDI4BIOIMAGE_Newsletter_-_December_19th_2024.pdf
Extracting URLs from: 2022-06-28_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20240301_NFDI4BIOIMAGE_Anniversary_Newsletter_-_March_1st_2024.pdf
Extracting URLs from: 20240910_NFDI4BIOIMAGE_Newsletter_-_September_10th_2024.pdf
Extracting URLs from: 2023-06-06_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 2023-03-13_NFDI4BIOIMAGE_Newsletter.pdf
Extracting URLs from: 20240617_NFDI4BIOIMAGE_Newsletter_-_June_17th_2024.pdf

Extracted URLs:
- https://nfdi4bioimage.de/resources-and-services/wall-calendar/
- https://fairdo.org/fdof-summit-2024/
- https://indico3.mpimagdeburg.mpg.de/event/31/
- https://doi.org/10.1242/jcs.262322
- https://globalbioimaging.org/exchange-of-experience/exchange-of-experience-ix
- https://ww



In [5]:
newsletter_links

['https://nfdi4bioimage.de/resources-and-services/wall-calendar/',
 'https://fairdo.org/fdof-summit-2024/',
 'https://indico3.mpi\x02magdeburg.mpg.de/event/31/',
 'https://doi.org/10.1242/jcs.262322',
 'https://globalbioimaging.org/exchange-of-experience/exchange-of-experience-ix',
 'https://www.focusonmicroscopy.org/',
 'https://saxfdm.de/veranstaltungen/digital-kitchen-nfdi4bioimage-scads-ai/',
 'https://doi.org/10.1038/d41586-023-03064-9',
 'https://doi.org/10.5281/zenodo.11235513',
 'https://doi.org/10.5281/zenodo.10617006),',
 'https://www.i2kconference.org/',
 'https://zenodo.org/communities/nfdi4biodiv?q=&l=list&p=1&s=10&sort=newest',
 'https://nfdi4bioimage.de/help-desk/',
 'https://converia.uni\x02mainz.de/frontend/index.php?page_id=4538',
 'https://gbm-compact.org/gbm\x02compact-2.html',
 'https://www.denbi.de/de-nbi-events/1547-biohackathon-germany-2',
 'https://forum.image.sc/t/community-call-metadata-in-ome\x02ngff/77570',
 'https://www.c-linkage.co.jp/iupab2024-',
 'https

In [6]:
yaml_file = "../resources/nfdi4bioimage.yml"

In [8]:
# Load the URLs from the YAML file
existing_urls = set()
if os.path.exists(yaml_file):
    yaml_data = read_yaml_file(yaml_file)
    for item in yaml_data.values():
        if "url" in item:
            existing_urls.add(item["url"])

# Compare newsletter links with YAML file URLs
new_links = [link for link in newsletter_links if link not in existing_urls]

# Print or process the new links as needed
if new_links:
    print("\nNew links to add:")
    for link in new_links:
        print(link)
else:
    print("\nNo new links found.")


New links to add:
https://nfdi4bioimage.de/resources-and-services/wall-calendar/
https://fairdo.org/fdof-summit-2024/
https://indico3.mpimagdeburg.mpg.de/event/31/
https://doi.org/10.1242/jcs.262322
https://globalbioimaging.org/exchange-of-experience/exchange-of-experience-ix
https://www.focusonmicroscopy.org/
https://saxfdm.de/veranstaltungen/digital-kitchen-nfdi4bioimage-scads-ai/
https://doi.org/10.1038/d41586-023-03064-9
https://doi.org/10.5281/zenodo.11235513
https://doi.org/10.5281/zenodo.10617006),
https://www.i2kconference.org/
https://zenodo.org/communities/nfdi4biodiv?q=&l=list&p=1&s=10&sort=newest
https://nfdi4bioimage.de/help-desk/
https://converia.unimainz.de/frontend/index.php?page_id=4538
https://gbm-compact.org/gbmcompact-2.html
https://www.denbi.de/de-nbi-events/1547-biohackathon-germany-2
https://forum.image.sc/t/community-call-metadata-in-omengff/77570
https://www.c-linkage.co.jp/iupab2024-
https://doi.org/10.1101/2023.05.05.539647
https://forum.image.sc/t/ome20