# Introduction
* Purpose: Create a local ChromaDB vector store with existing EU legal acts from S3 bucket
* Python Environment: regpy10ch

# Libraries

In [23]:
import pandas as pd
import numpy as np
import os
import json
import boto3
import chromadb
import logging
import traceback
from tqdm import tqdm
from datetime import datetime
from bs4 import BeautifulSoup
from multiprocessing import Pool
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [2]:
import hashlib
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredXMLLoader
from langchain_huggingface import HuggingFaceEmbeddings

def extract_mt_code(mt):
    if pd.isna(mt):
        return None
    else:
        return mt[0:5]
    
def extract_mt_term(mt):
    if pd.isna(mt):
        return None
    else:
        return mt[5:]

def decompose_celex(celex):
    sector = celex[0]
    year = celex[1:5]
    return sector, year

def make_id(doc_path):
    return hashlib.md5(doc_path[32:].encode("utf-8")).hexdigest()[:12]

In [3]:
AWS_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY")
bucket_name = 'regguru'
# Create an S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

la_object = 'eu/LEG_EN_HTML_20250721_04_08'
mtd_object = 'eu/LEG_MTD_20250709_22_36'

In [4]:
la_mtd_file = os.getenv("EU_LEGAL_ACT_METADATA_FILE")
la_mtd = pd.read_csv(la_mtd_file)
la_mtd["MT-code"] = la_mtd["MT"].apply(extract_mt_code)
la_mtd["MT-term"] = la_mtd["MT"].apply(extract_mt_term)
la_mtd['doc-path'] = "eu/LEG_EN_HTML_20250721_04_08/" + la_mtd['work-id'] + "/" + la_mtd['format'] + "/" + la_mtd['doc']
la_mtd['mtd-path'] = "eu/LEG_MTD_20250709_22_36/" + la_mtd['work-id'] + "/tree_non_inferred.rdf"

In [5]:
ch_client = chromadb.PersistentClient(path="./chroma_data")

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Slightly larger chunks
    chunk_overlap=200
)


# Test embed documents

In [None]:
# ch_client.delete_collection(name="legal-test")

In [16]:
test_collection = ch_client.get_or_create_collection(name="legal-test") # default embedder: all-MiniLM-L6-v2
documents = la_mtd.drop_duplicates('doc-path').iloc[0:5]

In [17]:
for idx, row in tqdm(documents.iterrows(), desc="Processing documents", total=documents.shape[0]):
    # Load document from S3
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=row["doc-path"])
        html_content = obj['Body'].read().decode('utf-8')  # HTML as string
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")  # plain text
    except Exception as e:
        logging.error("%d | Error loading %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error loading {row['doc-path']}")
        continue
    
    # Extract metadata
    try:
        meta = {}
        if soup.title:
            meta['title'] = soup.title.string
        for m in soup.find_all("meta"):
            if m.get("name") and m.get("content"):
                meta[m["name"].lower()] = m["content"]
        meta_download = la_mtd[(la_mtd["work-id"] == row["work-id"]) & (la_mtd["doc"] == row["doc"])]
        terms = meta_download["TERMS (PT-NPT)"].dropna().astype(str).unique()
        meta['eurovoc-terms'] = ';'.join(terms)
        mts = meta_download["MT"].dropna().astype(str).unique()
        meta['eurovoc-mt'] = ';'.join(mts)

        sector, year = decompose_celex(row['celex']) if row['celex'] else (None, None)
        meta['celex'] = row['celex'] if row['celex'] else None
        meta['celex-sector'] = sector # a character
        meta['celex-year'] = int(year) # a 4-digit number
    except Exception as e:
        logging.error("%d | Error extracting metadata %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error extracting metadata {row['doc-path']}")


    # Split, embed, and store
    try:
        chunks = text_splitter.split_text(text)
        texts = [chunk for chunk in chunks]
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
        test_collection.add(
            documents=texts,
            metadatas=[meta for _ in range(len(texts))],
            embedding=embeddings.embed_documents(texts),
            ids=[f"{make_id(row['doc-path'])}_{i}" for i in range(len(texts))]
        )
    except Exception as e:
        logging.error("%d | Error embedding %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error embedding {row['doc-path']}")
        continue

Processing documents:  20%|██        | 1/5 [00:06<00:24,  6.17s/it]

0 | Error embedding eu/LEG_EN_HTML_20250721_04_08/1a1e8486-a474-11e9-9d01-01aa75ed71a1/xhtml/L_2019187EN.01004101.doc.html


Processing documents:  40%|████      | 2/5 [00:10<00:15,  5.18s/it]

6 | Error embedding eu/LEG_EN_HTML_20250721_04_08/5fa72f58-9564-4ebe-a5a5-853e206ae2ed/html/32000R0212en.html


Processing documents:  60%|██████    | 3/5 [00:15<00:09,  4.86s/it]

18 | Error embedding eu/LEG_EN_HTML_20250721_04_08/65a9ddff-3e78-403b-ab34-98d7973be2b3/xhtml/L_2010117EN.01006001.doc.html


Processing documents:  80%|████████  | 4/5 [00:19<00:04,  4.60s/it]

26 | Error embedding eu/LEG_EN_HTML_20250721_04_08/39bcdc85-2e3b-4e36-a488-5197ee502afd/html/31999Y0917_02_en.html


Processing documents: 100%|██████████| 5/5 [00:23<00:00,  4.78s/it]

31 | Error embedding eu/LEG_EN_HTML_20250721_04_08/a4d26b15-882d-11e9-9369-01aa75ed71a1/xhtml/L_2019148EN.01000101.doc.html





# Test chroma client

In [108]:
# test = la_mtd.drop_duplicates('doc-path').iloc[0:5]
test_collection = ch_client.get_or_create_collection(name="legal-test") # default embedder: all-MiniLM-L6-v2

In [109]:
print("Number of embeddings:", test_collection.count())
results = test_collection.peek()
print(results)

Number of embeddings: 26
{'ids': ['9000786f2ac2_0', '9000786f2ac2_1', '9000786f2ac2_2', '9000786f2ac2_3', '9000786f2ac2_4', '9000786f2ac2_5', '9000786f2ac2_6', '9000786f2ac2_7', '9000786f2ac2_8', '9000786f2ac2_9'], 'embeddings': array([[-0.03268745,  0.0328047 ,  0.02150211, ...,  0.00085366,
         0.0815249 ,  0.0492176 ],
       [-0.0428882 ,  0.06924605,  0.02441872, ...,  0.05517734,
         0.0789023 ,  0.00538935],
       [ 0.01047072,  0.00346978,  0.01473061, ..., -0.00961938,
         0.04713432,  0.01296261],
       ...,
       [ 0.00838298,  0.03890036, -0.04555006, ...,  0.08372673,
        -0.02075115, -0.00260066],
       [ 0.00876509,  0.00809463,  0.00598892, ...,  0.02392675,
         0.04353177, -0.00210776],
       [ 0.02035374, -0.00467432,  0.01529113, ..., -0.04337729,
         0.07403102,  0.04721521]], shape=(10, 384)), 'documents': ['L_2019187EN.01004101.xml\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n12.7.2019\xa0\xa0\xa0\n\n\n\n\n\n\nEN\n\n\n\n\n\n\nOffi

In [110]:
all_docs = test_collection.get()
for doc in all_docs['metadatas']:
    print(doc)

{'title': 'L_2019187EN.01004101.xml', 'celex-year': 2019, 'celex-sector': '3', 'celex': '32019D1194', 'eurovoc-mt': '5641 fisheries;2841 health;5631 agricultural activity;6811 chemistry;5216 deterioration of the environment;5206 environmental policy', 'eurovoc-terms': 'fish;endocrine disease;fish disease;chemical product;endocrine disruptor;environmental protection'}
{'celex-year': 2019, 'celex-sector': '3', 'celex': '32019D1194', 'title': 'L_2019187EN.01004101.xml', 'eurovoc-terms': 'fish;endocrine disease;fish disease;chemical product;endocrine disruptor;environmental protection', 'eurovoc-mt': '5641 fisheries;2841 health;5631 agricultural activity;6811 chemistry;5216 deterioration of the environment;5206 environmental policy'}
{'title': 'L_2019187EN.01004101.xml', 'celex': '32019D1194', 'eurovoc-mt': '5641 fisheries;2841 health;5631 agricultural activity;6811 chemistry;5216 deterioration of the environment;5206 environmental policy', 'celex-year': 2019, 'celex-sector': '3', 'eurovoc

In [111]:
# Semantic search
results = test_collection.query(
    query_texts=[
        "Which chemical was classified as a substance of very high concern for its endocrine-disrupting effects under the EU Commission Implementing Decision 2019/1194?",
        "On what date did EU Regulation 402/2010 become effective?"],
    n_results=5
)

results["documents"]

[['HAS ADOPTED THIS DECISION:\n\n\n\n\n\n\n\n\nArticle 1\n\n\n\n\n1.\xa0\xa0\xa04-tert-butylphenol (PTBP) (EC No 202-679-0, CAS No 98-54-4) is identified as a substance of very high concern pursuant to Article 57(f) of Regulation (EC) No 1907/2006 due to its endocrine disrupting properties with probable serious effects to the environment which give rise to an equivalent level of concern to those of other substances listed in Article 57(a) to (e) of that Regulation.\n\n\n\n\n\n\n2.\xa0\xa0\xa0The substance referred to in paragraph 1 shall be included in the candidate list referred to in Article 59(1) of Regulation (EC) No 1907/2006 with the following indication under ‘Reason for inclusion’: ‘Endocrine disrupting properties (Article 57(f) - environment)’.\n\n\n\n\n\n\n\n\nArticle 2\n\n\nThis Decision is addressed to the European Chemicals Agency.\n\n\n\n\n\n\n\n\n\n\nDone at Brussels, 5\xa0July 2019.\n\n\n\n\n\n\nFor the Commission\n\n\n\n\nElżbieta BIEŃKOWSKA\n            \n\n\n\n\nMemb

Metadata filtering

In [112]:
# Before limiting to a specific year
results = test_collection.query(
    query_texts=["civil servants"],
    n_results=5
)

results["documents"]

[['EUR-Lex - 31999Y0917(02) - EN\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAvis juridique important\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\n\n\n31999Y0917(02)\n\n\n\n\nInformation from the Commission - Appointment of a new member of the Committee of Experts on the Transit of Electricity between Grids set up under Commission Decision 92/167/EEC (Text with EEA relevance)  \n\n\n\n\n\n\nOfficial Journal C 263 , 17/09/1999 P. 0003 - 0003\n \n\n\n\n\n\n\n\n\n\n\n\n\nAppointment of a new member of the Committee of Experts on the Transit of Electricity between Grids set up under Commission Decision 92/167/EEC\n(1999/C 263/03)\n(Text with EEA relevance)\nBy its decision of 13 September 1999 the Commission appointed to the position of new member of the Committee of Experts on the Transit of Electricity between Grids:\nas Spanish representative of the high-voltage grids,\nMr Victoriano Casajús, Director General of Transport, Red Electrica, to replace the outgoing Mr Novales.',
  'EUR-Lex - 32000R0212 - EN\n\n\n\

In [113]:
# After limiting to a specific year
results = test_collection.query(
    query_texts=["civil servants"],
    where={"celex-year": 2000},
    n_results=5
)

results["documents"]

[['EUR-Lex - 32000R0212 - EN\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAvis juridique important\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\n\n\n32000R0212\n\n\n\n\nCouncil Regulation (EC, ECSC, Euratom) No 212/2000 of 24 January 2000 adjusting the daily subsistence allowance rates for officials on mission in Austria, Finland and Sweden laid down in Article 13 of Annex VII to the Staff Regulations of Officials of the European Communities  \n\n\n\n\n\n\nOfficial Journal L 024 , 29/01/2000 P. 0001 - 0002',
  'COUNCIL REGULATION (EC, ECSC, EURATOM) No 212/2000\nof 24 January 2000\nadjusting the daily subsistence allowance rates for officials on mission in Austria, Finland and Sweden laid down in Article 13 of Annex VII to the Staff Regulations of Officials of the European Communities\nTHE COUNCIL OF THE EUROPEAN UNION,\nHaving regard to the Treaty establising the European Community, and in particular Article 283 thereof,\nHaving regard to the Staff Regulations of Officials and the Conditions of Employment of O

In [87]:
# results = collection.query(
#     query_texts=["Tropical"],
#     where={"$and": [
#         {"color": "yellow"},
#         {"weight": {"$gt": 150}}
#     ]},
#     n_results=5
# )

# print(results["documents"][0])

In [114]:
# Full text search
results = test_collection.get(
    where_document={"$contains": "Union Customs Code"}
)

results["documents"]

['L_2019148EN.01000101.xml\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n6.6.2019\xa0\xa0\xa0\n\n\n\n\n\n\nEN\n\n\n\n\n\n\nOfficial Journal of the European Union\n\n\n\n\n\n\nL 148/1\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n            COMMISSION IMPLEMENTING REGULATION (EU) 2019/921\n\n\nof 3\xa0June 2019\n         \n\n\nconcerning the classification of certain goods in the Combined Nomenclature\n\n\n\n\n\n\nTHE EUROPEAN COMMISSION,\n\n\n\n\nHaving regard to the Treaty on the Functioning of the European Union,\n\n\n\n\n\n\nHaving regard to Regulation (EU) No 952/2013 of the European Parliament and of the Council of 9\xa0October 2013 laying down the Union Customs Code\xa0\n(\n1\n)\n, and in particular Article 57(4) and Article 58(2) thereof,\n\n\n\n\nWhereas:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n(1)\n\n\n\n\n\n\nIn order to ensure uniform application of the Combined Nomenclature annexed to Council Regulation (EEC) No 2658/87\xa0\n(\n2\n)\n, it is necessary to adopt measures concerning the classif

# Embed all documents

In [13]:
log_filename = datetime.now().strftime("errors_%Y%m%d_%H%M%S.log")
logging.basicConfig(
    filename=log_filename,          # file to write logs
    level=logging.ERROR,            # log only errors and above
    format="%(asctime)s - %(levelname)s - %(message)s"
)

In [None]:
# ch_client.delete_collection(name="legal")
collection = ch_client.get_or_create_collection(name="legal")
documents = la_mtd.drop_duplicates('doc-path').reset_index(drop=True)
print(documents.shape[0], "documents to process")

46665 documents to process


## Loop
stop at 3067

In [16]:
for idx, row in tqdm(documents.iterrows(), desc="Processing documents", total=documents.shape[0]):
    # Load document from S3
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=row["doc-path"])
        html_content = obj['Body'].read().decode('utf-8')  # HTML as string
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")  # plain text
    except Exception as e:
        logging.error("%d | Error loading %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error loading {row['doc-path']} | {e}")
        continue
    
    # Extract metadata
    try:
        meta = {}
        if soup.title:
            meta['title'] = soup.title.string
        for m in soup.find_all("meta"):
            if m.get("name") and m.get("content"):
                meta[m["name"].lower()] = m["content"]
        meta_download = la_mtd[(la_mtd["work-id"] == row["work-id"]) & (la_mtd["doc"] == row["doc"])]
        terms = meta_download["TERMS (PT-NPT)"].dropna().astype(str).unique()
        meta['eurovoc-terms'] = ';'.join(terms)
        mts = meta_download["MT"].dropna().astype(str).unique()
        meta['eurovoc-mt'] = ';'.join(mts)

        sector, year = decompose_celex(row['celex']) if row['celex'] else (None, None)
        meta['celex'] = row['celex'] if row['celex'] else None
        meta['celex-sector'] = sector # a character
        meta['celex-year'] = int(year) # a 4-digit number
    except Exception as e:
        logging.error("%d | Error extracting metadata %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error extracting metadata {row['doc-path']} | {e}")


    # Split, embed, and store
    try:
        chunks = text_splitter.split_text(text)
        texts = [chunk for chunk in chunks]
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
        collection.add(
            documents=texts,
            metadatas=[meta for _ in range(len(texts))],
            embeddings=embeddings.embed_documents(texts),
            ids=[f"{make_id(row['doc-path'])}_{i}" for i in range(len(texts))]
        )
    except Exception as e:
        logging.error("%d | Error embedding %s:\n%s", idx, row["doc-path"], traceback.format_exc())
        print(f"{idx} | Error embedding {row['doc-path']} | {e}")
        continue

Processing documents:   0%|          | 165/46665 [29:10<90:08:02,  6.98s/it] ERROR:root:165 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:166 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0011.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing docu

165 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
166 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
167 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
168 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
169 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0003.xml.jpg | 'utf-8' codec can't d

ERROR:root:170 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 171/46665 [29:10<20:54:46,  1.62s/it]ERROR:root:171 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:172 |

170 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
171 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
172 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0027.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
173 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:174 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 175/46665 [29:11<9:26:02,  1.37it/s] ERROR:root:175 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:176 |

174 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
175 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
176 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0018.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
177 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:178 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0026.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 179/46665 [29:11<4:45:18,  2.72it/s]ERROR:root:179 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:180 | 

178 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0026.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
179 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
180 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
181 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:182 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0019.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 183/46665 [29:11<2:38:03,  4.90it/s]ERROR:root:183 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0009.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:184 | 

182 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0019.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
183 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
184 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
185 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001701.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:186 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:187 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0012.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 188/46665 [29:11<1:32:59,  8.33it/s]ERROR:root:188 | 

186 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
187 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
188 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   0%|          | 190/46665 [29:12<1:28:14,  8.78it/s]ERROR:root:190 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:191 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0009.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



189 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0024.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
190 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   0%|          | 192/46665 [29:12<1:34:15,  8.22it/s]ERROR:root:192 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0010.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:193 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing docume

191 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
192 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
193 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
194 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001701.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
195 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0009.xml.jpg | 'utf-8' codec can't d

ERROR:root:196 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0010.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 197/46665 [29:12<1:01:32, 12.58it/s]ERROR:root:197 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0012.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:198 | 

196 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
197 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
198 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
199 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001701.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:200 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 201/46665 [29:12<52:27, 14.76it/s]ERROR:root:201 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:202 | Er

200 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
201 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
202 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
203 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
204 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0011.xml.jpg | 'utf-8' codec can't d

ERROR:root:205 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 206/46665 [29:13<43:48, 17.68it/s]ERROR:root:206 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0025.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:207 | Er

205 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
206 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0025.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
207 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
208 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
209 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0001.xml.jpg | 'utf-8' codec can't d

ERROR:root:210 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0011.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:211 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 212/46665 [29:13<45:24, 17.05it/s]ERROR:root:212 | Er

210 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
211 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
212 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
213 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0030.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:214 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0020.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:215 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:216 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0

214 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0020.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
215 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
216 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
217 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
218 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0014.xml.jpg | 'utf-8' codec can't d

ERROR:root:219 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 220/46665 [29:13<38:21, 20.18it/s]ERROR:root:220 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0029.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:221 | Er

219 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
220 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0029.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
221 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
222 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
223 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0007.xml.jpg | 'utf-8' codec can't d

ERROR:root:224 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0017.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:225 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   0%|          | 226/46665 [29:14<38:12, 20.26it/s]ERROR:root:226 | Er

224 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0017.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
225 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000101.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
226 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0028.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
227 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
228 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0021.xml.jpg | 'utf-8' codec can't d

ERROR:root:229 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0031.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:230 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:231 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0

229 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0031.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
230 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
231 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002401.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
232 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0017.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
233 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0007.xml.jpg | 'utf-8' codec can't d

Processing documents:   1%|          | 234/46665 [29:14<46:21, 16.69it/s]ERROR:root:234 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0015.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:235 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing document

234 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
235 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
236 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001701.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
237 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:238 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:239 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0023.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 240/46665 [29:15<54:30, 14.20it/s]ERROR:root:240 | Er

238 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
239 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0023.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
240 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
241 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01002101.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
242 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0011.xml.jpg | 'utf-8' codec can't d

ERROR:root:244 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 245/46665 [29:22<12:57:17,  1.00s/it]ERROR:root:245 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0022.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:246 |

244 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
245 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0022.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
246 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:247 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0014.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:248 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 249/46665 [29:22<7:16:46,  1.77it/s]ERROR:root:249 | 

247 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000501.notes.0014.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
248 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
249 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01001301.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:250 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 251/46665 [29:23<5:32:31,  2.33it/s]

250 | Error loading eu/LEG_EN_HTML_20250721_04_08/e9cc666d-ab4d-4716-a3e8-df42fc0e0747/xhtml/FOR-C_2007070EN.01000901.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   1%|          | 470/46665 [1:15:07<126:41:48,  9.87s/it]  ERROR:root:470 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01003101.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 471/46665 [1:15:07<89:16:01,  6.96s/it] 

470 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01003101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   1%|          | 472/46665 [1:15:15<93:05:50,  7.26s/it]ERROR:root:472 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01005201.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   1%|          | 473/46665 [1:15:15<65:50:46,  5.13s/it]ERROR:root:473 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01004401.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' 

472 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01005201.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
473 | Error loading eu/LEG_EN_HTML_20250721_04_08/7d563afe-3f24-4d51-97d6-5bd889c3606c/xhtml/FOR-C_2006082EN.01004401.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   2%|▏         | 857/46665 [2:36:03<106:42:14,  8.39s/it]  ERROR:root:857 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01001701.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 858/46665 [2:36:03<75:27:27,  5.93s/it] ERROR:root:858 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01000501.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf

857 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01001701.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
858 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01000501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
859 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01004101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
860 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01001301.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:861 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01001701.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 862/46665 [2:36:03<25:00:49,  1.97s/it]

861 | Error loading eu/LEG_EN_HTML_20250721_04_08/58708103-2a78-42e0-beaa-3f88048f23df/xhtml/FOR-C_2006055EN.01001701.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   2%|▏         | 915/46665 [2:47:56<104:53:07,  8.25s/it] ERROR:root:915 | Error loading eu/LEG_EN_HTML_20250721_04_08/a1c35c51-a6bb-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_9008_FIN_ENG_xhtml_01001.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 916/46665 [2:47:56<74:07:30,  5.83s/it] 

915 | Error loading eu/LEG_EN_HTML_20250721_04_08/a1c35c51-a6bb-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_9008_FIN_ENG_xhtml_01001.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   2%|▏         | 1016/46665 [3:34:57<108:30:40,  8.56s/it]  ERROR:root:1016 | Error embedding eu/LEG_EN_HTML_20250721_04_08/894f347a-dd2d-11ee-b9d9-01aa75ed71a1/xhtml/L_202400348EN.html:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 41, in <module>
    collection.add(
  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/site-packages/chromadb/api/models/Collection.py", line 89, in add
    self._client._add(
  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/site-packages/chromadb/api/rust.py", line 407, in _add
    return self.bindings.add(
chromadb.errors.InternalError: ValueError: Batch size of 33719 is greater than max batch size of 5461

Processing documents:   2%|▏         | 1017/46665 [5:42:32<29193:34:32, 2302.33s/it]

1016 | Error embedding eu/LEG_EN_HTML_20250721_04_08/894f347a-dd2d-11ee-b9d9-01aa75ed71a1/xhtml/L_202400348EN.html | ValueError: Batch size of 33719 is greater than max batch size of 5461


Processing documents:   2%|▏         | 1025/46665 [5:46:06<1951:43:10, 153.95s/it]  ERROR:root:1025 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1026 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0013.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Proces

1025 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1026 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1027 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1028 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1029 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1030/46665 [5:46:06<463:34:35, 36.57s/it]

1028 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1029 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   2%|▏         | 1031/46665 [5:46:12<373:37:20, 29.47s/it]ERROR:root:1031 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1032/46665 [5:46:12<282:06:18, 22.26s/it]

1031 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1032 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1033/46665 [5:46:12<209:32:28, 16.53s/it]ERROR:root:1033 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0012.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root

1032 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1033 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1034 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1035 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1036 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0010.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1037/46665 [5:46:12<75:14:24,  5.94s/it] ERROR:root:1037 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root

1036 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1037 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   2%|▏         | 1039/46665 [5:46:13<49:18:16,  3.89s/it]ERROR:root:1039 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



1038 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1039 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1040 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1041/46665 [5:46:13<33:21:57,  2.63s/it]ERROR:root:1041 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

1040 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1041 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1042 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1043 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1044 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1045/46665 [5:46:13<15:59:19,  1.26s/it]ERROR:root:

1043 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1044 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1045 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1046 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1047/46665 [5:46:13<11:18:49,  1.12it/s]ERROR:root:1047 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

1046 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1047 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1048 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0014.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1049 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1050 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1051/46665 [5:46:14<5:55:42,  2.14it/s]ERROR:root:1

1049 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1050 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1051 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1052 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1053 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1054 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1055/46665 [5:46:14<3:29:17,  3.63it/s]ERROR:root:1

1053 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1054 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1055 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1056 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1057/46665 [5:46:14<2:54:05,  4.37it/s]ERROR:root:1057 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1

1056 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1057 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001901.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1058 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001501.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1059 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1060 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1061/46665 [5:46:15<2:10:44,  5.81it/s]ERROR:root:1

1059 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000201.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1060 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01000701.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1061 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1062 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0016.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   2%|▏         | 1063/46665 [5:46:15<1:47:04,  7.10it/s]

1062 | Error loading eu/LEG_EN_HTML_20250721_04_08/3a8bb2db-4f56-4fc5-935f-742c73709883/xhtml/FOR-C_2007089EN.01001101.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   3%|▎         | 1330/46665 [6:43:37<221:44:45, 17.61s/it]ERROR:root:1330 | Error loading eu/LEG_EN_HTML_20250721_04_08/1ffa1590-8958-11e7-b5c6-01aa75ed71a1/xhtml/C_2017281EN.01001402.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



1330 | Error loading eu/LEG_EN_HTML_20250721_04_08/1ffa1590-8958-11e7-b5c6-01aa75ed71a1/xhtml/C_2017281EN.01001402.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   3%|▎         | 1375/46665 [6:50:58<119:58:27,  9.54s/it]ERROR:root:1375 | Error loading eu/LEG_EN_HTML_20250721_04_08/de17c0c7-ba3d-11e8-99ee-01aa75ed71a1/xhtml/C_2018327EN.01000601.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   3%|▎         | 1376/46665 [6:50:58<84:36:42,  6.73s/it] 

1375 | Error loading eu/LEG_EN_HTML_20250721_04_08/de17c0c7-ba3d-11e8-99ee-01aa75ed71a1/xhtml/C_2018327EN.01000601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   3%|▎         | 1395/46665 [6:58:37<195:52:15, 15.58s/it]ERROR:root:1395 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/a4a13718-61e1-11f0-bf4e-01aa75ed71a1/xhtml/C_202503941EN.html:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 27, in <module>
    sector, year = decompose_celex(row['celex']) if row['celex'] else (None, None)
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/2677490457.py", line 19, in decompose_celex
    sector = celex[0]
TypeError: 'float' object is not subscriptable



1395 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/a4a13718-61e1-11f0-bf4e-01aa75ed71a1/xhtml/C_202503941EN.html | 'float' object is not subscriptable


Processing documents:   3%|▎         | 1545/46665 [7:23:34<149:12:31, 11.90s/it]ERROR:root:1545 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/5290cd79-62a6-11f0-bf4e-01aa75ed71a1/xhtml/L_202501430EN.html:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 27, in <module>
    sector, year = decompose_celex(row['celex']) if row['celex'] else (None, None)
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/2677490457.py", line 19, in decompose_celex
    sector = celex[0]
TypeError: 'float' object is not subscriptable



1545 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/5290cd79-62a6-11f0-bf4e-01aa75ed71a1/xhtml/L_202501430EN.html | 'float' object is not subscriptable


Processing documents:   4%|▎         | 1695/46665 [8:38:05<168:38:27, 13.50s/it]  ERROR:root:1695 | Error loading eu/LEG_EN_HTML_20250721_04_08/2b3a2a49-a92f-11e9-9d01-01aa75ed71a1/xhtml/C_2019242EN.01000901.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▎         | 1696/46665 [8:38:05<118:51:04,  9.51s/it]ERROR:root:1696 | Error loading eu/LEG_EN_HTML_20250721_04_08/2b3a2a49-a92f-11e9-9d01-01aa75ed71a1/xhtml/C_2019242EN.01001001.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byt

1695 | Error loading eu/LEG_EN_HTML_20250721_04_08/2b3a2a49-a92f-11e9-9d01-01aa75ed71a1/xhtml/C_2019242EN.01000901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1696 | Error loading eu/LEG_EN_HTML_20250721_04_08/2b3a2a49-a92f-11e9-9d01-01aa75ed71a1/xhtml/C_2019242EN.01001001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▎         | 1736/46665 [8:46:50<347:45:50, 27.87s/it]ERROR:root:1736 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01001302.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▎         | 1737/46665 [8:46:51<244:21:07, 19.58s/it]ERROR:root:1737 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01003402.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 

1736 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01001302.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▎         | 1738/46665 [8:46:51<171:57:16, 13.78s/it]ERROR:root:1738 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01001301.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▎         | 1739/46665 [8:46:51<120:59:11,  9.69s/it]

1737 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01003402.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1738 | Error loading eu/LEG_EN_HTML_20250721_04_08/05dcec40-bade-11e7-a7f8-01aa75ed71a1/xhtml/C_2017364EN.01001301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 1776/46665 [8:52:59<174:21:38, 13.98s/it]ERROR:root:1776 | Error loading eu/LEG_EN_HTML_20250721_04_08/5e152044-a6be-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_8978_FIN_ENG_xhtml_01001.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1777/46665 [8:52:59<123:05:11,  9.87s/it]

1776 | Error loading eu/LEG_EN_HTML_20250721_04_08/5e152044-a6be-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_8978_FIN_ENG_xhtml_01001.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 1784/46665 [8:54:00<123:06:11,  9.87s/it]ERROR:root:1784 | Error loading eu/LEG_EN_HTML_20250721_04_08/ef1c174d-b49d-11e7-837e-01aa75ed71a1/xhtml/C_2017351EN.01000503.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1785/46665 [8:54:00<87:03:20,  6.98s/it] 

1784 | Error loading eu/LEG_EN_HTML_20250721_04_08/ef1c174d-b49d-11e7-837e-01aa75ed71a1/xhtml/C_2017351EN.01000503.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 1859/46665 [9:06:56<126:35:19, 10.17s/it]ERROR:root:1859 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006202.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1860/46665 [9:06:57<89:29:24,  7.19s/it] ERROR:root:1860 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005702.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 

1859 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006202.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1860 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005702.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1861 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003902.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1862/46665 [9:06:57<44:32:58,  3.58s/it]ERROR:root:1862 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1863 | Error loading eu/LEG_EN

1861 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003902.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1862 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1863 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004302.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1864 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006301.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1865/46665 [9:06:57<18:50:29,  1.51s/it]ERROR:root:1865 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01001901.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1866 | Error loading eu/LEG_EN

1864 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1865 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01001901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1866 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004503.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1867 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01011001.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1868/46665 [9:06:57<9:25:50,  1.32it/s] ERROR:root:1868 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005601.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 18

1867 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01011001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1868 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1869 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006403.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1870/46665 [9:06:58<5:59:22,  2.08it/s]ERROR:root:1870 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1871 | Error loading eu/LEG_EN_

1869 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006403.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1870 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1871 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013603.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1872 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003402.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1873/46665 [9:06:58<3:16:39,  3.80it/s]ERROR:root:1873 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003801.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 187

1872 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003402.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1873 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003801.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1874 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005916.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1875 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005906.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1876/46665 [9:06:58<2:05:27,  5.95it/s]ERROR:root:1876 | Error loading eu/LEG_EN_

1874 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005916.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1875 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005906.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1876 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1877 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006004.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1878 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004502.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1879 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003301.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1880/46665 [9:06:59<1:34:27,  7.90it/s]ERROR:root:1880 | Error loading eu/LEG_EN_

1878 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004502.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1879 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1880 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006402.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1881 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006801.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1882/46665 [9:06:59<1:24:46,  8.80it/s]ERROR:root:1882 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005907.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1883 | Error loading eu/LEG_EN_

1881 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006801.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1882 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005907.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1883 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013602.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1884 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005001.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1885 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006203.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1886/46665 [9:06:59<1:14:15, 10.05it/s]ERROR:root:1886 | Error loading eu/LEG_EN_

1884 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1885 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006203.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1886 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01010201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1887 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1888/46665 [9:06:59<1:08:40, 10.87it/s]ERROR:root:1888 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003604.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1889 | Error loading eu/LEG_EN_

1887 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1888 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003604.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1889 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1891 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006302.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1892/46665 [9:07:00<1:05:00, 11.48it/s]ERROR:root:1892 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005915.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



1890 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003903.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1891 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006302.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1892 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005915.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1893 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005905.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1894/46665 [9:07:00<1:02:32, 11.93it/s]ERROR:root:1894 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003802.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1895 | Error loading eu/LEG_EN_

1893 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005905.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1894 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003802.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1895 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004202.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1896 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003401.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1897 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1898/46665 [9:07:00<1:03:52, 11.68it/s]ERROR:root:1898 | Error loading eu/LEG_EN_

1896 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003401.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1897 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1898 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003502.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1899 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004301.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1900/46665 [9:07:00<1:00:54, 12.25it/s]ERROR:root:1900 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003901.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1901 | Error loading eu/LEG_EN_

1899 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1900 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1901 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1902 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006104.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



1902 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006104.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 1904/46665 [9:08:41<189:25:13, 15.23s/it]ERROR:root:1904 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1905/46665 [9:08:42<156:23:59, 12.58s/it]ERROR:root:1905 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01010202.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 

1904 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1905 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01010202.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1906 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004801.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1907/46665 [9:08:42<104:37:24,  8.42s/it]ERROR:root:1907 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003503.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1908 | Error loading eu/LEG_E

1906 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004801.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1907 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003503.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1908 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1909 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1910/46665 [9:08:42<58:09:46,  4.68s/it]ERROR:root:1910 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005101.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 19

1909 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1910 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1911 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006303.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1912 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013601.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1913/46665 [9:08:43<29:15:21,  2.35s/it]ERROR:root:1913 | Error loading eu/LEG_EN

1911 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006303.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1912 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1913 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004203.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1914 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005904.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1915/46665 [9:08:43<19:17:29,  1.55s/it]ERROR:root:1915 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005914.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1916 | Error loading eu/LEG_EN

1914 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005904.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1915 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005914.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1916 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006401.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1917 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01012201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1918 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013604.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1919/46665 [9:08:43<9:18:01,  1.34it/s] ERROR:root:1919 | Error loading eu/LEG_EN

1917 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01012201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1918 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013604.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1919 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005911.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1920 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1921 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003701.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1922 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004102.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1923/46665 [9:08:43<4:48:08,  2.59it/s]ERROR:root:1923 | Error loading eu/LEG_EN_

1921 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1922 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004102.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1923 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006002.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1924 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003003.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1925 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005401.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1926 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004001.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1927/46665 [9:08:44<2:50:01,  4.39it/s]ERROR:root:1927 | Error loading eu/LEG_EN_

1925 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005401.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1926 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1927 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003602.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1928 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005908.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1929/46665 [9:08:44<2:19:40,  5.34it/s]ERROR:root:1929 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006101.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1930 | Error loading eu/LEG_EN_

1928 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005908.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1929 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1930 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002903.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1931 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1932 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005909.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1933 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003603.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1934/46665 [9:08:44<1:25:22,  8.73it/s]ERROR:root:1934 | Error loading eu/LEG_EN_

1932 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005909.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1933 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003603.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1934 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002902.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1935 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1936 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006204.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1937 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01008201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1938/46665 [9:08:45<1:27:58,  8.47it/s]ERROR:root:1938 | Error loading eu/LEG_EN_

1936 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006204.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1937 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01008201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1938 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005910.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1939 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006701.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1940/46665 [9:08:45<1:23:09,  8.96it/s]ERROR:root:1940 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1941 | Error loading eu/LEG_EN_

1939 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1940 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1941 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1942 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006003.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1943 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002801.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1944/46665 [9:08:45<1:15:10,  9.91it/s]ERROR:root:1944 | Error loading eu/LEG_EN_

1942 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006003.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1943 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002801.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1944 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003002.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1945 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01010001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1946 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004601.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1947 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004002.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1948/46665 [9:08:46<1:14:29, 10.00it/s]ERROR:root:1948 | Error loading eu/LEG_EN_

1946 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1947 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004002.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1948 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1949 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005801.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1950/46665 [9:08:46<1:12:53, 10.22it/s]ERROR:root:1950 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006102.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1951 | Error loading eu/LEG_EN_

1949 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005801.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1950 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006102.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1951 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003702.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1952 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004101.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1953 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005912.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1954/46665 [9:08:46<1:03:58, 11.65it/s]ERROR:root:1954 | Error loading eu/LEG_EN_

1952 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1953 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005912.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1954 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005902.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1955 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1956/46665 [9:08:46<1:16:27,  9.75it/s]ERROR:root:1956 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006001.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1957 | Error loading eu/LEG_EN_

1955 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1956 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1957 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003703.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1958 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004204.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1959 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005903.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1960/46665 [9:08:47<1:05:21, 11.40it/s]ERROR:root:1960 | Error loading eu/LEG_EN_

1958 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004204.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1959 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005903.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1960 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005913.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1961 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003001.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1962 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006304.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1963 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002802.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1964/46665 [9:08:47<1:01:44, 12.07it/s]ERROR:root:1964 | Error loading eu/LEG_EN_

1962 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006304.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1963 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002802.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1964 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01013401.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1965 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004003.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1966 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01007201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1967 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003504.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1968/46665 [9:08:47<58:32, 12.72it/s]ERROR:root:1968 | Error loading eu/LEG_EN_HT

1966 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01007201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1967 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003504.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1968 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006601.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1969 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01008202.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1970/46665 [9:08:48<1:11:10, 10.47it/s]

1969 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01008202.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1970 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004701.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:1971 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003102.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1972/46665 [9:08:48<1:26:10,  8.64it/s]ERROR:root:1972 | Error loading eu/LEG_EN_

1970 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01004701.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1971 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01003102.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1972 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01010101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1973 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01011501.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1974/46665 [9:08:48<1:42:22,  7.28it/s]

1973 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01011501.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1974 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005301.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 1975/46665 [9:08:49<1:56:16,  6.41it/s]ERROR:root:1975 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006103.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 197

1974 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01005301.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1975 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01006103.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:1976 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002901.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



1976 | Error loading eu/LEG_EN_HTML_20250721_04_08/c277d948-f0d0-11eb-a71c-01aa75ed71a1/xhtml/L_2021273EN.01002901.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 2019/46665 [9:16:50<79:17:56,  6.39s/it] ERROR:root:2019 | Error loading eu/LEG_EN_HTML_20250721_04_08/dbe27cc5-8142-11eb-9ac9-01aa75ed71a1/xhtml/CI2021081EN.01001101.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2020 | Error loading eu/LEG_EN_HTML_20250721_04_08/dbe27cc5-8142-11eb-9ac9-01aa75ed71a1/xhtml/CI2021081EN.01001201.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 2

2019 | Error loading eu/LEG_EN_HTML_20250721_04_08/dbe27cc5-8142-11eb-9ac9-01aa75ed71a1/xhtml/CI2021081EN.01001101.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2020 | Error loading eu/LEG_EN_HTML_20250721_04_08/dbe27cc5-8142-11eb-9ac9-01aa75ed71a1/xhtml/CI2021081EN.01001201.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   4%|▍         | 2022/46665 [9:17:03<73:56:32,  5.96s/it]ERROR:root:2022 | Error loading eu/LEG_EN_HTML_20250721_04_08/fc6da850-a6bb-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_9039_FIN_ENG_xhtml_01001.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   4%|▍         | 2023/46665 [9:17:04<56:05:54,  4.52s/it]

2022 | Error loading eu/LEG_EN_HTML_20250721_04_08/fc6da850-a6bb-11e8-99ee-01aa75ed71a1/xhtml/MTF_2018_9039_FIN_ENG_xhtml_01001.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   5%|▍         | 2251/46665 [10:07:34<156:29:45, 12.68s/it]ERROR:root:2251 | Error loading eu/LEG_EN_HTML_20250721_04_08/acd71941-42b9-4adf-bdf6-6c840fc67d32/xhtml/L_1989156EN.01001702.tif.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



2251 | Error loading eu/LEG_EN_HTML_20250721_04_08/acd71941-42b9-4adf-bdf6-6c840fc67d32/xhtml/L_1989156EN.01001702.tif.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   5%|▌         | 2481/46665 [11:08:17<58:48:24,  4.79s/it]   ERROR:root:2481 | Error loading eu/LEG_EN_HTML_20250721_04_08/c59e1997-04b4-4360-885a-b7e86198b1b5/xhtml/FOR-C_2007182EN.01000101.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2482 | Error loading eu/LEG_EN_HTML_20250721_04_08/c59e1997-04b4-4360-885a-b7e86198b1b5/xhtml/FOR-C_2007182EN.01000101.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:r

2481 | Error loading eu/LEG_EN_HTML_20250721_04_08/c59e1997-04b4-4360-885a-b7e86198b1b5/xhtml/FOR-C_2007182EN.01000101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2482 | Error loading eu/LEG_EN_HTML_20250721_04_08/c59e1997-04b4-4360-885a-b7e86198b1b5/xhtml/FOR-C_2007182EN.01000101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2483 | Error loading eu/LEG_EN_HTML_20250721_04_08/c59e1997-04b4-4360-885a-b7e86198b1b5/xhtml/FOR-C_2007182EN.01000101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   6%|▌         | 2638/46665 [11:34:59<67:34:41,  5.53s/it] ERROR:root:2638 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/76962ce3-52ef-11f0-a9d0-01aa75ed71a1/xhtml/L_202501248EN.html:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 27, in <module>
    sector, year = decompose_celex(row['celex']) if row['celex'] else (None, None)
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/2677490457.py", line 19, in decompose_celex
    sector = celex[0]
TypeError: 'float' object is not subscriptable



2638 | Error extracting metadata eu/LEG_EN_HTML_20250721_04_08/76962ce3-52ef-11f0-a9d0-01aa75ed71a1/xhtml/L_202501248EN.html | 'float' object is not subscriptable


Processing documents:   6%|▌         | 2650/46665 [11:36:38<63:52:40,  5.22s/it] ERROR:root:2650 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2651 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:roo

2650 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2651 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2652 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2653 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0020.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2654 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2655/46665 [11:36:39<18:42:39,  1.53s/it]ERROR:root:2655 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root

2654 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2655 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2656 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2657 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2659 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2660/46665 [11:36:39<7:43:26,  1.58it/s] ERROR:root:2660 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



2658 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2659 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2660 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2661 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2662/46665 [11:36:39<5:46:11,  2.12it/s]ERROR:root:2662 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

2661 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2662 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2663 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2664 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2665 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2666/46665 [11:36:39<3:17:28,  3.71it/s]ERROR:root:2666 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0014.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

2665 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2666 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0014.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2667 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2668 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2669 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2670 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0018.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2671/46665 [11:36:40<1:51:20,  6.59it/s]ERROR:root:

2669 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2670 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0018.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2671 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2672 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2673 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2674 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0011.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2675/46665 [11:36:40<1:17:14,  9.49it/s]ERROR:root:

2673 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2674 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2675 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2676 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2677 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2678 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2679/46665 [11:36:40<1:06:05, 11.09it/s]ERROR:root:

2677 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2678 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2679 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01001301.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2680 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2681 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0019.xml.jpg | 'utf-8' codec ca

ERROR:root:2682 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2683 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2684/46665 [11:36:40<50:29, 14.52it/s]ERROR:root:26

2682 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2683 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2684 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2685 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2686 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0003.xml.jpg | 'utf-8' codec ca

ERROR:root:2687 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01001301.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



2687 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01001301.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   6%|▌         | 2689/46665 [11:36:47<10:49:51,  1.13it/s]ERROR:root:2689 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2690 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing

2689 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000501.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2690 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2691 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2692 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01002001.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2693 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte



2693 | Error loading eu/LEG_EN_HTML_20250721_04_08/853f69fe-0226-4a67-b554-2dfb4d43c5e5/xhtml/FOR-C_2007072EN.01000901.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   6%|▌         | 2782/46665 [11:49:34<108:01:47,  8.86s/it]ERROR:root:2782 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2783 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processin

2782 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2783 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2784 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2785 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0020.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2786/46665 [11:49:34<35:38:44,  2.92s/it]ERROR:root:2786 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing

2785 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0020.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2786 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2787 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2788 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2789/46665 [11:49:34<17:38:28,  1.45s/it]ERROR:root

2787 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2788 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2789 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2790 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2791/46665 [11:49:34<11:38:57,  1.05it/s]ERROR:root:2791 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root

2790 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2791 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2792 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2793 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0008.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2794 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0004.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2795/46665 [11:49:35<5:48:32,  2.10it/s]ERROR:root:

2793 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2794 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2795 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2796 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2797/46665 [11:49:35<4:20:33,  2.81it/s]ERROR:root:2797 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

2796 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2797 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2798 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0014.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2799 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2800 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2801/46665 [11:49:35<2:31:45,  4.82it/s]ERROR:root:2801 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

2800 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2801 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2802 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0018.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2803 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2804 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2805 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2806/46665 [11:49:36<1:31:55,  7.95it/s]ERROR:root:

2804 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2805 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2806 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   6%|▌         | 2808/46665 [11:49:40<8:26:04,  1.44it/s]ERROR:root:2808 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2809 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0010.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing 

2808 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2809 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2810 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2811 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2812 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2813 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0009.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2814/46665 [11:49:40<3:41:52,  3.29it/s]ERROR:root:

2812 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2813 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2814 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0019.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2815 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2816 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2817/46665 [11:49:41<2:33:06,  4.77it/s]ERROR:root:2817 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0012.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

2816 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2817 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2818 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2819 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2820 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2821 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2822/46665 [11:49:41<1:40:37,  7.26it/s]ERROR:root:

2820 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01001301.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2821 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000501.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2822 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2823 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:2824 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:2825 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▌         | 2826/46665 [11:49:41<1:12:38, 10.06it/s]

2824 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01002001.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
2825 | Error loading eu/LEG_EN_HTML_20250721_04_08/9b85eb16-60d1-4a32-9134-f760a9ad923d/xhtml/FOR-C_2007072EN.01000901.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   6%|▋         | 3020/46665 [12:49:03<107:20:00,  8.85s/it]  ERROR:root:3020 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3021 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Process

3020 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3021 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3022 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0016.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3023 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0020.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3024 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0004.xml.jpg | 'utf-8' codec ca

ERROR:root:3025 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3026 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▋         | 3027/46665 [12:49:04<20:18:28,  1.68s/it]

3025 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3026 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3028 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▋         | 3029/46665 [12:49:13<31:14:40,  2.58s/it]ERROR:root:3029 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing

3028 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3029 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3030 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3031 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0007.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   6%|▋         | 3032/46665 [12:49:13<17:15:09,  1.42s/it]ERROR:root

3030 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0017.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3031 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3032 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3033 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3034 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3035 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3036/46665 [12:49:13<8:39:47,  1.40it/s] 

3034 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3035 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3036 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3037/46665 [12:49:14<7:17:13,  1.66it/s]ERROR:root:3037 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0014.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:

3036 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0015.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3037 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0014.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3038 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3039 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3040 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3041 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0018.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3042/46665 [12:49:14<3:07:45,  3.87it/s]ERROR:root:

3040 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01001301.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3041 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0018.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3042 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0008.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3043 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3044 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0001.xml.jpg | 'utf-8' codec ca

ERROR:root:3045 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0011.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3046 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0005.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3047/46665 [12:49:14<1:46:44,  6.81it/s]ERROR:root:

3045 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0011.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3046 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0005.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3047 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0010.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3048 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0004.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3049 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3050 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01001301.notes.0003.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3051/46665 [12:49:14<1:21:35,  8.91it/s]ERROR:root:

3049 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3050 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01001301.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3051 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0009.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   7%|▋         | 3053/46665 [12:49:14<1:09:21, 10.48it/s]ERROR:root:3053 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3054 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing 

3052 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0019.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3053 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3054 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0002.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


ERROR:root:3055 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0012.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3056 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0006.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3057/46665 [12:49:15<1:00:55, 11.93it/s]ERROR:root:

3055 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0012.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3056 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0006.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3057 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3058 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01001301.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3059 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000501.notes.0002.xml.jpg | 'utf-8' codec ca

ERROR:root:3060 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0001.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ERROR:root:3061 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0013.xml.jpg:
Traceback (most recent call last):
  File "/var/folders/zj/t8mz7g111911kcsdk6nwh8g00000gn/T/ipykernel_1963/64700856.py", line 5, in <module>
    html_content = obj['Body'].read().decode('utf-8')  # HTML as string
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Processing documents:   7%|▋         | 3062/46665 [12:49:15<47:31, 15.29it/s]ERROR:root:30

3060 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000101.notes.0001.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3061 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0013.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3062 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01002001.notes.0003.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
3063 | Error loading eu/LEG_EN_HTML_20250721_04_08/e6e6ee52-e3b7-4196-a846-6b8dffa5db52/xhtml/FOR-C_2007072EN.01000901.notes.0007.xml.jpg | 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


Processing documents:   7%|▋         | 3067/46665 [12:50:04<182:26:47, 15.07s/it]


KeyboardInterrupt: 

## Parallel

In [7]:
def parallel_extract(batch_func, df, collection, num_processes=4):
    chunks = np.array_split(df, num_processes)
    args = [(chunk, collection) for chunk in chunks]

    with Pool(processes=num_processes) as pool:
        pool.starmap(batch_func, args)

In [None]:
from helper.chroma import process_documents_batch
parallel_extract(process_documents_batch, documents, collection, num_processes=4)

In [10]:
from concurrent.futures import ThreadPoolExecutor

def parallel_extract_threads(batch_func, df, collection, num_threads=4):
    chunks = np.array_split(df, num_threads)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(batch_func, chunk, collection) for chunk in chunks]
        results = [f.result() for f in futures]

In [11]:
from helper.chroma import process_documents_batch
parallel_extract_threads(process_documents_batch, documents, collection, num_threads=4)

  return bound(*args, **kwds)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/threading.py", line 973, in _bootstrap
    self._bootstrap_inner()
  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/Users/demouser/anaconda3/envs/regpy10ch/lib/python3.10/threading.py", line 953, in run
    self._target(

[99737] 3 | Error processing eu/LEG_EN_HTML_20250721_04_08/39bcdc85-2e3b-4e36-a488-5197ee502afd/html/31999Y0917_02_en.html | Numpy is not available
[99737] 2 | Error processing eu/LEG_EN_HTML_20250721_04_08/65a9ddff-3e78-403b-ab34-98d7973be2b3/xhtml/L_2010117EN.01006001.doc.html | Numpy is not available
[99737] 4 | Error processing eu/LEG_EN_HTML_20250721_04_08/a4d26b15-882d-11e9-9369-01aa75ed71a1/xhtml/L_2019148EN.01000101.doc.html | Numpy is not available
[99737] 0 | Error processing eu/LEG_EN_HTML_20250721_04_08/1a1e8486-a474-11e9-9d01-01aa75ed71a1/xhtml/L_2019187EN.01004101.doc.html | Numpy is not available
[99737] 1 | Error processing eu/LEG_EN_HTML_20250721_04_08/5fa72f58-9564-4ebe-a5a5-853e206ae2ed/html/32000R0212en.html | Numpy is not available


# Check chroma client

In [20]:
print("Number of embeddings:", collection.count())
results = collection.peek()
results['ids']

Number of embeddings: 118643


['9000786f2ac2_0',
 '9000786f2ac2_1',
 '9000786f2ac2_2',
 '9000786f2ac2_3',
 '9000786f2ac2_4',
 '9000786f2ac2_5',
 '9000786f2ac2_6',
 '9000786f2ac2_7',
 '9000786f2ac2_8',
 '9000786f2ac2_9']

In [21]:
all_ids = collection.get()["ids"]
doc_ids = set([i.split('_')[0] for i in all_ids])
len(doc_ids)

2674

In [25]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate

In [27]:
# Prompt template
prompt_template = """Answer the question in a concise manner based on the following context:
{context}

Question: {question}

If the context does not contain the answer, use other sources.
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

In [None]:
collection = ch_client.get_or_create_collection(name="legal")
if collection.count() == 0:
    print("Vectorstore is empty.")
else:
    print(f"Vectorstore loaded. [{len(doc_ids)} documents. {collection.count()} embeddings.]")
llm = ChatOpenAI(model="gpt-4o", temperature=0.1)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
vectorstore = Chroma(
    client=ch_client,
    collection_name="legal",
    embedding_function=embeddings  # same embedding function used for inserts
)

In [38]:
retriever = vectorstore.as_retriever(
    search_kwargs={
        "k": 5,
        "filter":{
            "celex-year": 2010
        }
    }
)
qa = RetrievalQA.from_chain_type(
    llm=llm,  # or whichever LLM
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

In [32]:
# Query it
response = qa.invoke("Which chemical was classified as a substance of very high concern for its endocrine-disrupting effects under the EU Commission Implementing Decision 2019/1194?")
print(response)

{'query': 'Which chemical was classified as a substance of very high concern for its endocrine-disrupting effects under the EU Commission Implementing Decision 2019/1194?', 'result': 'The chemical 4-tert-butylphenol (PTBP) was classified as a substance of very high concern for its endocrine-disrupting effects under the EU Commission Implementing Decision 2019/1194.', 'source_documents': [Document(metadata={'celex-year': 2019, 'celex-sector': '3', 'celex': '32019D1194', 'title': 'L_2019187EN.01004101.xml', 'eurovoc-terms': 'fish;endocrine disease;fish disease;chemical product;endocrine disruptor;environmental protection', 'eurovoc-mt': '5641 fisheries;2841 health;5631 agricultural activity;6811 chemistry;5216 deterioration of the environment;5206 environmental policy'}, page_content='HAS ADOPTED THIS DECISION:\n\n\n\n\n\n\n\n\nArticle 1\n\n\n\n\n1.\xa0\xa0\xa04-tert-butylphenol (PTBP) (EC No 202-679-0, CAS No 98-54-4) is identified as a substance of very high concern pursuant to Article

In [33]:
qn = "On what date did EU Regulation 402/2010 become effective?"
response = qa.invoke(qn)
response

{'query': 'On what date did EU Regulation 402/2010 become effective?',
 'result': 'The context provided does not mention EU Regulation 402/2010 or its effective date. Therefore, based on external sources, EU Regulation 402/2010 became effective on May 20, 2010.',
 'source_documents': [Document(metadata={'title': 'L_202402781EN.000101.fmx.xml', 'celex-year': 2024, 'celex-sector': '3', 'eurovoc-terms': 'market approval;plant health control;plant health product', 'eurovoc-mt': '2031 marketing;5606 agricultural policy;5626 means of agricultural production', 'celex': '32024R2781'}, page_content='HAS ADOPTED THIS REGULATION:\n\n\n\n\n\n\n\n\nArticle\xa01\n\n\nThe Annex to Implementing Regulation (EU) No\xa0540/2011 is amended in accordance with the Annex to this Regulation.\n\n\n\n\n\n\nArticle\xa02\n\n\nThis Regulation shall enter into force on the twentieth day following that of its publication in the \nOfficial Journal of the European Union\n.\n\n\n\n\n\n\n\n\n\n\nThis Regulation shall be

In [34]:
response['source_documents'][0].metadata['celex']

'32024R2781'

In [41]:
qn = "On what day should EU Regulation 402/2010 enter into force?"
response = qa.invoke(qn)
response

{'query': 'On what day should EU Regulation 402/2010 enter into force?',
 'result': 'EU Regulation 402/2010 should enter into force on the day following its publication in the Official Journal of the European Union.',
 'source_documents': [Document(metadata={'celex': '32010R0832', 'title': 'L_2010248EN.01000101.xml', 'celex-year': 2010, 'eurovoc-terms': 'management information system;Cohesion Fund;regional disparity;European Regional Development Fund;dissemination of information;administrative transparency;economic and social cohesion;European Social Fund;management audit', 'celex-sector': '3', 'eurovoc-mt': '4021 management;1021 EU finance;1616 regions and regional policy;3221 documentation;0436 executive power and public service;1016 European construction'}, page_content='(8)\n\n\n\n\n\n\nRegulation (EC) No 1828/2006 should therefore be amended accordingly.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n(9)\n\n\n\n\n\n\nFor reasons of coherence it is appropriate that the amendm

# To Solve
1. Long '\n' sequences in the document content
2. utf-8 error in processing

In [None]:
structure = {'eu': ['eurlex-download', 'eurlex-sparql', 'eurlex-sample'], 'sg': ['mas', 'sso']}
for f in ['raw', 'processed']:
    for region, sources in structure.items():
        for source in sources:
            dir_path = f"data_ingestion/{f}/{region}/{source}"
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

In [43]:
for f in ['common', 'eu', 'sg', 'us', 'pipelines']:
    dir_path = f"data_ingestion/src/{f}"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

In [4]:
for f in ['chroma']:
    for r in ['eu', 'sg', 'us']:
        dir_path = f"/Users/demouser/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/Reg-Guru/backend/data_ingestion/{f}/{r}" 
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)