### Load necessary libraries
We will use `yaml` to parse the YAML file and `collections.Counter` to count URL occurrences. `pathlib.Path` and `os` are used to build file paths and create the output directory.

In [1]:
import yaml
from collections import Counter
from pathlib import Path
import os

### Define a function to extract URLs from the YAML file
The function recursively walks the parsed data (nested dictionaries and lists) and collects the URLs stored under every `url` key.

In [2]:
def extract_urls(data):
    """Recursively collect the URLs stored under ``"url"`` keys.

    Dictionaries and lists are traversed to any depth. Whenever a dictionary
    has a ``"url"`` key, its value is collected if it is a string, or each of
    its string entries is collected if it is a list. Values under a ``"url"``
    key are not traversed any further.

    Parameters
    ----------
    data : object
        Parsed YAML content. Anything other than a dict or a list yields no
        URLs.

    Returns
    -------
    list of str
        All URLs in traversal order; repeated URLs are kept so that callers
        can count occurrences.
    """
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == "url":
                if isinstance(value, str):
                    urls.append(value)
                elif isinstance(value, list):
                    # Keep only string entries: empty YAML items (None),
                    # numbers or nested lists would otherwise leak into the
                    # result and break counting or joining the URLs later on.
                    urls.extend(item for item in value if isinstance(item, str))
            else:
                urls.extend(extract_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(extract_urls(item))
    return urls

### Load the YAML file
Parse the YAML file to prepare the data for URL extraction.

In [3]:
yml_file = Path("../resources/nfdi4bioimage.yml")
# Decode the file as UTF-8 explicitly instead of relying on the platform's
# default locale encoding, which can mis-decode or reject non-ASCII content.
with open(yml_file, 'r', encoding="utf-8") as file:
    # safe_load (rather than yaml.load) is used to parse the document.
    data = yaml.safe_load(file)

### Extract all URLs from the loaded data
Use the `extract_urls` function to collect all URLs within the YAML structure.

In [4]:
# Flat list of every URL found in the parsed YAML data; repeated URLs are
# kept so that they can be counted in a later step.
urls = extract_urls(data)

### Identify duplicate URLs
Use `collections.Counter` to count the occurrences of each URL and keep only the URLs that appear more than once.

In [5]:
# A URL is a duplicate when it was collected more than once; the result keeps
# the order in which each URL was first encountered.
duplicates = [
    link
    for link, occurrences in Counter(urls).items()
    if occurrences > 1
]

### Save duplicate URLs to a file
Save the duplicate URLs to a text file for further inspection.

In [6]:
output_file = Path("../results/duplicate_urls.txt")
# Create the results directory (and any missing parents) if it does not exist.
os.makedirs(output_file.parent, exist_ok=True)
# Write as UTF-8 explicitly so that URLs containing non-ASCII characters do
# not raise UnicodeEncodeError under a legacy platform default encoding.
with open(output_file, 'w', encoding="utf-8") as file:
    # One URL per line, with no trailing newline after the last entry.
    file.write("\n".join(duplicates))