In [1]:
import gzip
import urllib.error
import urllib.request
import zlib
from typing import Optional

from selectolax.parser import HTMLParser


In [2]:
def fetch_content(url: str, timeout: int = 10) -> Optional[str]:
    """
    Fetch a URL with ``urllib`` and return the response body as text.

    The request carries browser-like headers and advertises gzip/deflate
    support, so the body is decompressed according to the response's
    ``Content-Encoding`` header before it is decoded. Redirects are followed
    by ``urllib.request.urlopen`` itself.

    Args:
        url: The URL to fetch content from.
        timeout: Request timeout in seconds (default: 10).

    Returns:
        The decoded page content. Failures are raised (see below) rather than
        reported as ``None``.

    Raises:
        ValueError: If ``url`` is empty or not a string.
        urllib.error.HTTPError: For HTTP error statuses (404, 500, etc.).
        urllib.error.URLError: For network-related errors.
    """
    if not url or not isinstance(url, str):
        raise ValueError("URL must be a non-empty string")

    # Set up headers to mimic a real browser
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    request = urllib.request.Request(url, headers=headers)

    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
        content_encoding = (response.headers.get("Content-Encoding") or "").strip().lower()
        # Charset announced in Content-Type, falling back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'

    # urllib hands back the body exactly as sent, so a compressed body has to
    # be inflated here because the request asked for gzip/deflate.
    if content_encoding in ("gzip", "x-gzip"):
        body = gzip.decompress(body)
    elif content_encoding == "deflate":
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            body = zlib.decompress(body, -zlib.MAX_WBITS)

    try:
        return body.decode(charset, errors='replace')
    except LookupError:
        # The declared charset is unknown to Python; fall back to UTF-8.
        return body.decode('utf-8', errors='replace')


In [15]:

from curl_cffi import requests as creq


def fetch_content(url: str, timeout: int = 15) -> Optional[str]:
    """
    Download a page through ``creq`` (curl_cffi) while impersonating Chrome.

    The URL is echoed to stdout before the request is made. The body is
    returned only for a 200 response whose content type contains ``text/html``
    and whose text carries none of the challenge-page markers; every other
    outcome yields ``None``.
    """
    print(url)
    response = creq.get(
        url,
        timeout=timeout,
        allow_redirects=True,
        impersonate="chrome",  # "chrome124" / "chrome120" are alternatives to try if needed
    )

    is_ok = response.status_code == 200
    if not is_ok or "text/html" not in response.headers.get("content-type", "").lower():
        return None

    html = response.text
    # Heuristic: strings that show up on the bot/challenge splash page.
    challenge_markers = ("Client Challenge", "/_fs-ch-")
    if any(marker in html for marker in challenge_markers):
        return None
    return html


In [4]:
# Name of the package whose pypi-browser.org pages are fetched in the cells below.
package_name = "liburlparser"

In [5]:
# Base URL of a pypi-browser.org package page; the package name is appended to it.
pypi_browser_url = "https://pypi-browser.org/package/"

In [6]:
# Download the package page (<base URL><package name>/) and show the raw HTML.
content = fetch_content(f"{pypi_browser_url}{package_name}/")

print(content)

https://pypi-browser.org/package/liburlparser/


<!doctype html>
<html lang="en">
    <head>
        <meta charset="utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <link rel="icon" href="https://pypi-browser.org/static/favicon.png" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/bootstrap-5.2.1.min.css" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/site.css" />
        <title>liburlparser | PyPI Browser</title>
        
    </head>
    <body class="page-package">
        <nav class="navbar navbar-dark bg-dark">
            <div class="container">
                <a class="navbar-brand" href="https://pypi-browser.org/">PyPI Browser</a>
                <form class="d-flex" method="GET" action="https://pypi-browser.org/search">
                   <input
                       id="package-search"
                       class="form-control me-2"
                       type="text"
             

In [7]:
# cards[0].css_first(".card-header").text().strip()


In [8]:
from pprint import pprint

# Parse the package page; every ".card" element is read as one entry holding
# a header text (stored under "tag") and a list of file links.
tree = HTMLParser(content)
cards = tree.css(".card")

package_tags = []
for card in cards:
    header = card.css_first(".card-header")
    file_list = card.css_first(".list-group")
    if header is None or file_list is None:
        # css_first() yields None when nothing matches; a card lacking a
        # header or a file list is skipped instead of raising AttributeError.
        continue
    package_tags.append({
        "tag": header.text().strip(),
        "wheels": [{
            "name": a.css_first("span").text().strip(),
            "url": a.attributes["href"],
        } for a in file_list.css("a")],
    })

pprint(package_tags)

[{'tag': '1.6.0',
  'wheels': [{'name': 'liburlparser-1.6.0-cp310-cp310-win32.whl',
              'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp310-cp310-win32.whl'},
             {'name': 'liburlparser-1.6.0-cp310-cp310-win_amd64.whl',
              'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp310-cp310-win_amd64.whl'},
             {'name': 'liburlparser-1.6.0-cp311-cp311-win32.whl',
              'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp311-cp311-win32.whl'},
             {'name': 'liburlparser-1.6.0-cp311-cp311-win_amd64.whl',
              'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp311-cp311-win_amd64.whl'},
             {'name': 'liburlparser-1.6.0-cp312-cp312-win32.whl',
              'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp312-cp312-win32.whl'},
             {'name': 'liburlparser-1.6.0-cp312-cp312-win_amd64.whl',
      

In [9]:
# Pick the second file entry of the first listed tag as the example to inspect.
wheel1 = package_tags[0]["wheels"][1]
print(repr(wheel1))

{'name': 'liburlparser-1.6.0-cp310-cp310-win_amd64.whl', 'url': 'https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp310-cp310-win_amd64.whl'}


In [10]:

# Fetch the page for the selected file; note that this rebinds `content`.
content = fetch_content(wheel1["url"])
print(content)



https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp310-cp310-win_amd64.whl


<!doctype html>
<html lang="en">
    <head>
        <meta charset="utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <link rel="icon" href="https://pypi-browser.org/static/favicon.png" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/bootstrap-5.2.1.min.css" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/site.css" />
        <title>liburlparser-1.6.0-cp310-cp310-win_amd64.whl | liburlparser | PyPI Browser</title>
        
    </head>
    <body class="page-package-file">
        <nav class="navbar navbar-dark bg-dark">
            <div class="container">
                <a class="navbar-brand" href="https://pypi-browser.org/">PyPI Browser</a>
                <form class="d-flex" method="GET" action="https://pypi-browser.org/search">
                   <input
                       id="package-search"
 

In [11]:
wheel_url = wheel1["url"]
# The first two dash-separated fields of the file name (here "liburlparser" and
# "1.6.0") joined back together, plus ".dist-info", give the metadata directory name.
dist_info = "-".join(wheel1["name"].split("-")[:2]) + ".dist-info"


In [20]:
# Page for the METADATA file inside the wheel's .dist-info directory.
metadata_url = f"{wheel_url}/{dist_info}/METADATA"
content = fetch_content(metadata_url)  # rebinds `content` to this page's HTML
print(content)

https://pypi-browser.org/package/liburlparser/liburlparser-1.6.0-cp310-cp310-win_amd64.whl/liburlparser-1.6.0.dist-info/METADATA


<!doctype html>
<html lang="en">
    <head>
        <meta charset="utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <link rel="icon" href="https://pypi-browser.org/static/favicon.png" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/bootstrap-5.2.1.min.css" />
        <link rel="stylesheet" href="https://pypi-browser.org/static/site.css" />
        <title>liburlparser-1.6.0.dist-info/METADATA | liburlparser-1.6.0-cp310-cp310-win_amd64.whl | liburlparser | PyPI Browser</title>
        
    <style>
        .fluffy-code {
    font-family: monospace;
}

.fluffy-code .line-numbers {
    padding: 2px 0;
    border-right-width: 1px;
    border-right-style: solid;

    float: left;
    line-height: 1.25em;
    text-align: right;
}

.fluffy-code .line-numbers a {
    cursor: pointer;
    display:

In [28]:
# The fetched page is HTML; the METADATA text is taken from its first <pre> element.
tree = HTMLParser(content)
metadata_str = tree.css_first("pre").text()
print(metadata_str)

Metadata-Version: 2.1
Name: liburlparser
Version: 1.6.0
Summary: Fastest Url parser in the world
Author-Email: Mohammad Raziei <mohammadraziei1375@gmail.com>
Classifier: Development Status :: 4 - Beta
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: C++
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Project-URL: Homepage, https://github.com/mohammadraziei/liburlparser
Project-URL: Bug Tracker, https://github.com/mohammadraziei/liburlparser/issues
Project-URL: Document, https://mohammadraziei.github.io/liburlparser/latest
Requires-Python: >=3.8
Requires-Dist: filelock
Provide

In [36]:
import re
from typing import Dict, List, Union

def parse_metadata(metadata_text: str) -> Dict[str, Union[str, List[str]]]:
    """
    Parse a Python package METADATA file into a structured dictionary.

    Everything before the first blank line is treated as ``Key: value``
    headers; everything after it becomes the ``"Description"`` entry.

    Args:
        metadata_text: Full text of the METADATA file.

    Returns:
        A dict keyed by header name. Fields listed in ``multi_fields`` are
        always lists; any other field is a string unless it occurs more than
        once, in which case its values are collected into a list. Continuation
        lines (starting with a space or a tab) are appended to the preceding
        value, separated by a single space. Header lines that are neither a
        continuation nor ``Key: value`` are ignored.
    """
    # Fields that yield a list even when they appear only once.
    multi_fields = {
        "Classifier", "Requires-Dist", "Provides-Extra",
        "Project-URL", "License-File"
    }

    result: Dict[str, Union[str, List[str]]] = {}
    lines = metadata_text.strip().splitlines()

    # The first blank line ends the headers; the rest is the description.
    header_lines, description_lines = lines, []
    for index, line in enumerate(lines):
        if line.strip() == "":
            header_lines, description_lines = lines[:index], lines[index + 1:]
            break

    key = None
    for line in header_lines:
        # Continuation line: folded header values may be indented with a space
        # or a tab, and both are accepted here.
        if key is not None and line[:1] in (" ", "\t"):
            value = line.strip()
            current = result[key]
            if isinstance(current, list):
                current[-1] += ' ' + value
            else:
                result[key] = current + ' ' + value
            continue

        match = re.match(r'^([A-Za-z0-9-]+):\s*(.*)$', line)
        if not match:
            continue

        key, value = match.group(1).strip(), match.group(2).strip()
        if key in result:
            existing = result[key]
            if not isinstance(existing, list):
                # A repeated single-value field is promoted to a list.
                existing = result[key] = [existing]
            existing.append(value)
        else:
            result[key] = [value] if key in multi_fields else value

    # Add description if present
    if description_lines:
        result["Description"] = "\n".join(description_lines).strip()

    return result



In [86]:
# Turn the METADATA text pulled from the page into a dict of header fields.
parsed_metadata = parse_metadata(metadata_str)

In [94]:
import re
from typing import List, Dict, Optional, Tuple, Any

from dataclasses import dataclass 

@dataclass
class Dependency:
    """A requirement split into its package name and an optional condition string."""

    package: str
    condition: Optional[str] = None

    def __repr__(self) -> str:
        # Compact form: the name immediately followed by the condition, if any.
        if self.condition is None:
            return self.package
        return self.package + self.condition
    

# Matches one `extra == "name"` clause inside an environment marker and captures the name.
_EXTRA_RE = re.compile(r'''\bextra\s*==\s*['"]([^'"]+)['"]''', re.IGNORECASE)

# The same clause together with the boolean connector joining it to a neighbour.
_EXTRA_CLAUSE_RE = re.compile(
    r'''\s*\b(?:and|or)\s+extra\s*==\s*['"][^'"]+['"]'''
    r'''|\bextra\s*==\s*['"][^'"]+['"](?:\s*\b(?:and|or)\b)?''',
    re.IGNORECASE,
)

# Leftovers once clauses are removed: empty parentheses and dangling connectors.
_EMPTY_PARENS_RE = re.compile(r'\(\s*\)')
_DANGLING_CONNECTOR_RE = re.compile(r'^\s*(?:and|or)\b|\b(?:and|or)\s*$', re.IGNORECASE)


def _strip_extra_clauses(marker: str) -> str:
    """Return ``marker`` without its ``extra == ...`` clauses (best effort, regex based)."""
    cleaned = _EXTRA_CLAUSE_RE.sub("", marker).strip()
    previous = None
    # Repeat until stable: removing "()" can expose another dangling connector.
    while cleaned != previous:
        previous = cleaned
        cleaned = _EMPTY_PARENS_RE.sub("", cleaned)
        cleaned = _DANGLING_CONNECTOR_RE.sub("", cleaned).strip()
    return cleaned


def extract_dependencies(metadata: Dict[str, Any]) -> Tuple[List["Dependency"], Dict[str, List["Dependency"]]]:
    """
    Split the ``Requires-Dist`` entries of parsed METADATA into dependencies.

    Each entry becomes a ``Dependency`` whose ``package`` is the bare name and
    whose ``condition`` is whatever follows the name (version specifier and
    marker) with any ``extra == "..."`` clause removed, or ``None`` when
    nothing is left. An ``extra`` clause is recognised anywhere in the marker,
    not only directly after the semicolon.

    Args:
        metadata: Parsed metadata; ``"Requires-Dist"`` may be a list of
            strings, a single string, missing or ``None``.

    Returns:
        ``(dependencies, optional_dependencies)``: entries without an extra,
        and a dict mapping each extra name to the entries that name it.
        Entries that are blank or do not start with a package name are skipped.
    """
    dependencies: List[Dependency] = []
    optional_dependencies: Dict[str, List[Dependency]] = {}

    requires_dist = metadata.get("Requires-Dist") or []
    if isinstance(requires_dist, str):
        requires_dist = [requires_dist]

    for req in requires_dist:
        req = req.strip()
        if not req:
            continue

        # Only the marker (the part after the first ";") can name an extra.
        spec, _, marker = req.partition(";")
        # dict.fromkeys de-duplicates while keeping first-seen order.
        extras = list(dict.fromkeys(_EXTRA_RE.findall(marker)))
        if extras:
            remaining_marker = _strip_extra_clauses(marker)
            req_clean = spec.strip()
            if remaining_marker:
                req_clean = f"{req_clean}; {remaining_marker}"
        else:
            req_clean = req

        # Split package name and condition
        match = re.match(r"^([A-Za-z0-9_.\-]+)\s*(.*)$", req_clean)
        if not match:
            continue

        pkg, rest = match.groups()
        dep = Dependency(pkg, rest.strip() or None)

        if extras:
            for extra in extras:
                optional_dependencies.setdefault(extra, []).append(dep)
        else:
            dependencies.append(dep)

    return dependencies, optional_dependencies


In [95]:
# Split the Requires-Dist entries of `parsed_metadata` into plain and per-extra dependencies.
dependencies, optional_dependencies = extract_dependencies(parsed_metadata)

print(dependencies)
print(optional_dependencies)

[filelock, examplepkg>=1.0; python_version >= '3.8' and sys_platform != 'win32']
{'test': [pytest-xdist, requests], 'online': [requests]}


In [96]:
# Dependencies grouped under the "test" extra.
optional_dependencies["test"]

[pytest-xdist, requests]

In [97]:
# Hand-written Requires-Dist sample exercising extract_dependencies. It lives
# under its own name so the metadata parsed from the real wheel
# (`parsed_metadata`) is not overwritten when this cell runs.
sample_metadata = {
    "Requires-Dist": [
        "filelock",
        "pytest-xdist; extra == 'test'",
        "requests; extra == 'test'",
        "requests; extra == 'online'",
        "examplepkg >=1.0; python_version >= '3.8' and sys_platform != 'win32'",
    ]
}

deps, opt = extract_dependencies(sample_metadata)
print("Dependencies:", deps)
print("Optional deps:", opt)

# Inline checks so a Restart & Run All fails loudly if the split ever changes.
assert [dep.package for dep in deps] == ["filelock", "examplepkg"]
assert sorted(opt) == ["online", "test"]
assert [dep.package for dep in opt["test"]] == ["pytest-xdist", "requests"]

Dependencies: [filelock, examplepkg>=1.0; python_version >= '3.8' and sys_platform != 'win32']
Optional deps: {'test': [pytest-xdist, requests], 'online': [requests]}


In [99]:
# Condition string kept for the second plain dependency (version specifier plus marker).
deps[1].condition

">=1.0; python_version >= '3.8' and sys_platform != 'win32'"