In [1]:
import json
from urllib.parse import urlparse
import requests
from tqdm import tqdm
import magic

First, we remove all duplicate URLs.

In [20]:
# Raw URL list (may contain repeats) and the destination for the cleaned list
input_file = "all_urls.json"
output_file = "cleaned_urls.json"

# Read the raw list from disk
with open(input_file, "r") as src:
    urls = json.load(src)

# A set collapses repeated entries; sorted() returns them as an ordered list
# so the saved file is the same from run to run
unique_urls = sorted(set(urls))

# Write the de-duplicated list back out as indented JSON
with open(output_file, "w") as dst:
    json.dump(unique_urls, dst, indent=4)

print(f"Duplicate URLs removed. Cleaned list saved to {output_file}")
print(f"There are {len(unique_urls)} unique URLs in the list.")


Duplicate URLs removed. Cleaned list saved to cleaned_urls.json
There are 2158 unique URLs in the list.


Eliminate YouTube URLs, URLs that cannot be accessed, URLs of pages that cannot be found, and insecure URLs.

In [23]:
# The filtered list is written back over the same file it was read from
input_file = "cleaned_urls.json"
output_file = "cleaned_urls.json"

# Read the current URL list
with open(input_file, "r") as src:
    urls = json.load(src)

# URLs to drop; entries are matched by exact string equality
unwanted_urls = [
    "https://youtu.be/Qlzyp6QiUyA",
    "http://eacea.ec.europa.eu/bilateral_cooperation/index_en.php",
    "http://eacea.ec.europa.eu/erasmus_mundus/",
    "http://eacea.ec.europa.eu/tempus/",
    "http://www.elections2014.eu/en/",
    "http://www.elections2014.eu/en/in-the-member-states",
    "http://www.elections2014.eu/en/in-the-member-states/european-union",
    "http://www.elections2014.eu/en/new-commission/hearings/by-committee",
    "http://www.elections2014.eu/en/new-commission/portfolios-and-candidates",
    "http://www.elections2014.eu/en/new-parliament",
    "http://www.elections2014.eu/en/news-room/content/20140918IFG65303/html/Infographic-how-the-European-Commission-will-get-elected",
    "http://www.elections2014.eu/en/press-kit/content/20131112PKH24411/html/Overview-of-Parliament-and-the-2014-elections",
    "https://europa.eu/eyd2015/",
    "https://www.avrupa.info.tr/en",
]

# Membership is tested against a set built from the list above;
# the surviving URLs keep their original order
blocked = set(unwanted_urls)
new_urls = [url for url in urls if url not in blocked]

# Overwrite the file with the filtered list
with open(output_file, "w") as dst:
    json.dump(new_urls, dst, indent=4)

In [24]:
# URL list to analyse and the file that receives the per-prefix tally
input_file = "cleaned_urls.json"
output_file = "prefix_counts.json"

# Read the URL list
with open(input_file, "r") as src:
    urls = json.load(src)

# Tally URLs by their "<scheme>://<netloc>" prefix
prefix_counts = {}
for url in urls:
    parsed_url = urlparse(url)
    prefix = f"{parsed_url.scheme}://{parsed_url.netloc}"
    if prefix in prefix_counts:
        prefix_counts[prefix] += 1
    else:
        prefix_counts[prefix] = 1

# Sum of all tallies, i.e. the number of URLs processed
total_count = sum(prefix_counts.values())

# Highest count first; the sort is stable, so equal counts keep their
# first-seen order
sorted_prefix_counts = sorted(prefix_counts.items(), key=lambda item: -item[1])

# A dict keeps the sorted order and serialises as a JSON object
sorted_prefix_counts_dict = dict(sorted_prefix_counts)

# Write the tally to disk
with open(output_file, "w") as dst:
    json.dump(sorted_prefix_counts_dict, dst, indent=4)

# Report the total and where the tally was saved
print(f"Total number of URLs: {total_count}")
print(f"Sorted prefix counts saved to {output_file}")

Total number of URLs: 2144
Sorted prefix counts saved to prefix_counts.json


In [25]:
# URL list to validate
input_file = "cleaned_urls.json"

# Read the URL list
with open(input_file, "r") as src:
    urls = json.load(src)

# Collect every entry lacking an "http" prefix (the prefix test also
# accepts "https" and anything else that begins with those four letters)
invalid_urls = [url for url in urls if not url.startswith("http")]

# Report the offenders one per line, or confirm that there are none
if not invalid_urls:
    print("All URLs start with 'http'.")
else:
    print("URLs that do not start with 'http':")
    print(*invalid_urls, sep="\n")

All URLs start with 'http'.


In [4]:
# load URLs from a file
def load_urls(file_path):
    """Read the JSON document at ``file_path`` and return the decoded value.

    The file is decoded as UTF-8 instead of the platform's locale encoding,
    so a JSON file containing non-ASCII characters loads the same way on
    every operating system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# save URLs to a file
def save_urls(file_path, urls):
    """Serialise ``urls`` as JSON into ``file_path``.

    The file is opened in write mode, so existing content is replaced. The
    JSON is written with a 4-space indent.

    Args:
        file_path: Destination path.
        urls: JSON-serialisable object to store.
    """
    with open(file_path, mode='w') as out:
        json.dump(urls, out, indent=4)

def is_pdf(url):
    """Return True when the resource at ``url`` looks like a PDF document.

    The URL is fetched with a streaming GET (10 second timeout, redirects
    followed). It counts as a PDF when the ``Content-Type`` header mentions
    ``application/pdf`` (compared case-insensitively) or, failing that, when
    the first chunk of the decoded body starts with the ``%PDF`` signature.

    Args:
        url: The URL to probe.

    Returns:
        True if either check identifies a PDF, otherwise False.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            timeouts or HTTP error statuses; logged, then re-raised.
        Exception: Any other failure; logged, then re-raised.
    """
    try:
        # With stream=True the body is not downloaded up front. The context
        # manager closes the response on every exit path, so the connection
        # is released even though the body is never read to the end.
        with requests.get(url, stream=True, timeout=10, allow_redirects=True) as response:
            response.raise_for_status()  # Raise an error for HTTP issues

            # Debug: Check headers
            content_type = response.headers.get('Content-Type', '')
            print(f"URL: {url} | Content-Type: {content_type}")

            # Check for PDF MIME type in headers (media types are case-insensitive)
            if 'application/pdf' in content_type.lower():
                return True

            # Read the first chunk (up to 1KB) through iter_content, which undoes
            # gzip/deflate content encoding, so the signature check sees the
            # actual document bytes. An empty body yields b''.
            first_bytes = next(response.iter_content(chunk_size=1024), b'')

            # Not a PDF if neither MIME nor magic number matches
            return first_bytes.startswith(b'%PDF')

    except requests.exceptions.RequestException as e:
        # Debug: Log HTTP errors
        print(f"HTTP error for URL: {url} | Error: {e}")
        raise
    except Exception as e:
        # Debug: Log other errors
        print(f"Error for URL: {url} | Error: {e}")
        raise

In [None]:
# Input list and the three output files, one per category
input_file = 'cleaned_urls.json'
pdf_file = 'pdf_urls.json'
nonpdf_file = 'nonpdf_urls.json'
error_file = 'error_urls.json'

# Read the URLs to classify
urls = load_urls(input_file)

# One bucket per outcome
pdf_urls, nonpdf_urls, error_urls = [], [], []

# Probe each URL and place it in the matching bucket
for url in tqdm(urls):
    try:
        bucket = pdf_urls if is_pdf(url) else nonpdf_urls
    except Exception as exc:
        # A single handler suffices: requests' RequestException derives from
        # Exception, and both kinds of failure are recorded the same way
        error_urls.append({"url": url, "error": str(exc)})
    else:
        bucket.append(url)

# Write each bucket to its own file
save_urls(pdf_file, pdf_urls)
save_urls(nonpdf_file, nonpdf_urls)
save_urls(error_file, error_urls)