In [None]:
import requests
from bs4 import BeautifulSoup
import re
import io

# If it's a PDF:
from pdfminer.high_level import extract_text

def download_content(url, timeout=30):
    """
    Download content from the URL. Return tuple (content_bytes, content_type).

    Args:
        url: Address of the document to fetch.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
            (seconds, or a ``(connect, read)`` tuple). Without one, requests
            waits indefinitely on an unresponsive server. Pass ``None`` to
            restore the unbounded wait.

    Returns:
        A ``(content_bytes, content_type)`` tuple: the raw response body and
        the value of the ``Content-Type`` response header ('' when the header
        is absent).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection failures or timeouts.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    resp.raise_for_status()
    content_type = resp.headers.get('Content-Type', '')
    return resp.content, content_type

def parse_html(content_bytes):
    """
    Collect description / tax-code pairs from an HTML document.

    Every element carrying the CSS class ``product`` is inspected; inside it
    the elements with class ``description`` and ``tax-code`` are looked up.
    Products missing either element are skipped. The class names are
    placeholders and must be adapted to the real page structure.

    Returns a list of dicts with the keys 'description' and 'tax_code', both
    holding the whitespace-stripped text of the matching element.
    """
    document = BeautifulSoup(content_bytes, 'html.parser')
    pairs = []
    for product in document.find_all(class_='product'):
        description_node = product.find(class_='description')
        tax_node = product.find(class_='tax-code')
        # Only products exposing both pieces of information are reported.
        if not description_node or not tax_node:
            continue
        entry = {
            'description': description_node.get_text(strip=True),
            'tax_code': tax_node.get_text(strip=True),
        }
        pairs.append(entry)
    return pairs

# Matches lines like "Tax code: 12345" (case insensitive); group 1 is the
# first run of digits following the label.
_TAX_CODE_RE = re.compile(r'Tax\s*Code\D*(\d+)', re.IGNORECASE)

# Matches lines like "Description: Widget A"; group 1 is everything after the
# label and an optional ':', '-' or '=' separator. An explicit separator is
# used instead of a greedy r'\D*', which would swallow the description text
# itself and leave only its last character (or the part starting at the first
# digit) in the capture group.
_DESCRIPTION_RE = re.compile(r'Description\s*[:\-=]?\s*(.+)', re.IGNORECASE)


def _labelled_description(line):
    """Return the text after a "Description" label in *line*, or None."""
    match = _DESCRIPTION_RE.search(line)
    return match.group(1) if match else None


def _pair_tax_codes_with_descriptions(text):
    """Scan plain text line by line and pair each tax code with a description."""
    # Blank lines are dropped so "previous"/"next" refer to lines with content.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    results = []
    for i, line in enumerate(lines):
        m_tax = _TAX_CODE_RE.search(line)
        if not m_tax:
            continue
        previous_line = lines[i - 1] if i > 0 else ''
        next_line = lines[i + 1] if i + 1 < len(lines) else ''
        # Prefer an explicitly labelled description, looking first at the
        # line above the tax code and then at the line below it.
        desc = _labelled_description(previous_line)
        if desc is None:
            desc = _labelled_description(next_line)
        # Fallback: take the preceding line verbatim ('' at the very top).
        if desc is None:
            desc = previous_line
        results.append({
            'description': desc,
            'tax_code': m_tax.group(1)
        })
    return results


def parse_pdf(content_bytes):
    """
    Parse PDF content to find product descriptions and tax codes.

    All text is extracted from the PDF, then a line-based heuristic is
    applied. The expected layout is along the lines of:

        Description: Widget A
        Tax Code: 12345

    For every line containing a "Tax Code" label followed by digits, the
    description is taken from a "Description" label on the previous line,
    else on the next line, else the previous line as-is. Adjust the patterns
    to the actual document format.

    Args:
        content_bytes: Raw bytes of the PDF document.

    Returns:
        A list of dicts with the keys 'description' and 'tax_code' (the
        matched digits, as a string), in document order.
    """
    text = extract_text(io.BytesIO(content_bytes))
    return _pair_tax_codes_with_descriptions(text)

def main(url='https://publications.europa.eu/resource/cellar/bb24a915-9729-11ef-a130-01aa75ed71a1.0006.03/DOC_1'):
    """
    Download the document at *url*, parse it and print what was found.

    The document is treated as a PDF when any of the following holds:
    the Content-Type header mentions ``application/pdf`` (compared
    case-insensitively, since media types are case-insensitive), the URL ends
    in ``.pdf``, or the body starts with the ``%PDF-`` signature. The last
    check covers servers that deliver PDFs under a generic content type from
    an extension-less URL. Anything else is parsed as HTML.

    Args:
        url: Document to process; defaults to the original hard-coded
            publication URL.
    """
    content, content_type = download_content(url)

    # decide parser
    is_pdf = (
        'application/pdf' in content_type.lower()
        or url.lower().endswith('.pdf')
        or content.startswith(b'%PDF-')
    )
    parsed = parse_pdf(content) if is_pdf else parse_html(content)

    # Output results
    for item in parsed:
        print(f"Description: {item['description']}")
        print(f"Tax code: {item['tax_code']}")
        print('---')

# Run the download-and-parse pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()