In [20]:
class HALWorkModel(BaseModel):
    """Validation schema for a single document returned by the HAL search API.

    Attribute names mirror the HAL field names so that a raw result
    dictionary can be passed straight to ``HALWorkModel.model_validate``.

    ``title_s``, ``docType_s`` and ``uri_s`` are required; a document missing
    any of them fails validation. The remaining declared fields are optional
    and default to ``None``. Keys that are not declared here are kept on the
    model rather than rejected (``extra='allow'``).
    """

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning; a plain dict is
    # accepted for ``model_config`` and needs no additional import.
    model_config = {'extra': 'allow'}

    # Required fields.
    title_s: List[str]
    docType_s: str
    uri_s: HttpUrl  # must parse as an http(s) URL

    # Optional fields: absent keys validate to None.
    authFullName_s: Optional[List[str]] = None
    publicationDateY_i: Optional[int] = None
    journalTitle_s: Optional[str] = None

In [22]:
# --- Configuration ---------------------------------------------------------
BASE_URL = "https://api.archives-ouvertes.fr/search/"
SEARCH_TERM = "Blanchot"
START_YEAR = 1998
ROWS_PER_PAGE = 100
OUTPUT_JSON_FILE = "hal_blanchot_data.json"
# Seconds to wait for the server before giving up. Without a timeout,
# requests waits indefinitely and a stalled connection would hang the cell.
REQUEST_TIMEOUT = 30

# --- State filled in by the download ---------------------------------------
validated_works = []    # HALWorkModel instances that passed validation
failed_works_log = []   # {"uri": ..., "error_details": ...} per rejected doc
start = 0               # offset of the next page to request
num_found = 0           # total number of matches reported by the API

# The query and the year filter are built once and shared by the count
# request and every page request, so the two can never drift apart.
specific_query = f'(title_t:"{SEARCH_TERM}" OR abstract_t:"{SEARCH_TERM}")'
year_filter = f'publicationDateY_i:[{START_YEAR} TO *]'

# --- Step 1: ask only for the total hit count (rows=0) ---------------------
print("Querying HAL API to get total number of results...")
initial_params = {'q': specific_query, 'fq': year_filter, 'wt': 'json', 'rows': 0}
try:
    initial_http_response = requests.get(
        BASE_URL, params=initial_params, timeout=REQUEST_TIMEOUT
    )
    # Surface HTTP errors instead of reading a count out of an error payload.
    initial_http_response.raise_for_status()
    initial_resp = initial_http_response.json()
    num_found = initial_resp.get('response', {}).get('numFound', 0)
    print(f"Found {num_found} total works to download.")
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a non-JSON body on requests versions whose
    # JSONDecodeError does not derive from RequestException.
    print(f"Initial API request failed: {e}")
    num_found = 0

# --- Step 2: page through the results --------------------------------------
if num_found > 0:
    with tqdm(total=num_found, desc="Downloading works from HAL") as pbar:
        while start < num_found:
            params = {
                'q': specific_query,
                'fq': year_filter,
                'fl': 'title_s, authFullName_s, publicationDateY_i, journalTitle_s, uri_s, docType_s, docid',
                'wt': 'json',
                'rows': ROWS_PER_PAGE,
                'start': start,
                # A fixed sort order keeps offset-based paging consistent
                # from one request to the next.
                'sort': 'docid asc',
            }

            # Only the network call and the JSON decoding are guarded here;
            # per-document validation errors are handled separately below.
            try:
                response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"\nAn error occurred during download: {e}")
                break

            docs = data.get('response', {}).get('docs', [])
            if not docs:
                # The server returned fewer documents than announced; stop
                # rather than requesting empty pages until `start` catches up.
                break

            for doc_data in docs:
                try:
                    validated_work = HALWorkModel.model_validate(doc_data)
                    validated_works.append(validated_work)
                except ValidationError as e:
                    # Keep going: one malformed record must not abort the run.
                    failed_works_log.append({"uri": doc_data.get("uri_s"), "error_details": e.errors()})

            pbar.update(len(docs))
            start += ROWS_PER_PAGE
            time.sleep(0.1)  # short pause between pages to be polite to the API

print("\n-------------------------------------------")
print(f"Download complete. Total validated records: {len(validated_works)}")

# Records that failed validation are collected in `failed_works_log`; report
# them so that dropped documents do not go unnoticed.
if failed_works_log:
    print(
        f"Warning: {len(failed_works_log)} records failed validation and were "
        "skipped (details in 'failed_works_log')."
    )

# The download loop stops early on a request error or an empty page, so make
# it visible when fewer documents were processed than the API announced.
processed_count = len(validated_works) + len(failed_works_log)
if processed_count < num_found:
    print(
        f"Warning: only {processed_count} of {num_found} works were retrieved; "
        "the dataset is incomplete."
    )

# Group the dumped records by their URI to detect works returned more than once.
records_by_id = defaultdict(list)
for work in validated_works:
    work_dict = work.model_dump()
    work_id = work_dict.get('uri_s')
    if work_id:
        records_by_id[work_id].append(work_dict)

# A URI is a duplicate when more than one record was grouped under it.
duplicate_ids = [
    uri for uri, grouped_records in records_by_id.items() if len(grouped_records) > 1
]

if duplicate_ids:
    print(f"\nFound {len(duplicate_ids)} IDs with duplicate entries:")
    print('\n'.join(str(url) for url in duplicate_ids))
else:
    print("\nNo duplicate entries were found.")


print(f"\nSaving all {len(validated_works)} validated records (including duplicates)...")

# mode='json' converts every field to a JSON-compatible type (for example the
# URL object becomes a plain string) so json.dump can serialise the result.
records_to_save = [work.model_dump(mode='json') for work in validated_works]

# The file is written as UTF-8, so keep non-ASCII characters (accented letters
# in titles and author names) as-is instead of \uXXXX escapes. The parsed
# content is identical; the file is simply readable and smaller.
with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
    json.dump(records_to_save, f, indent=2, ensure_ascii=False)

print(f"✅ Successfully saved all records to '{OUTPUT_JSON_FILE}'.")

Querying HAL API to get total number of results...
Found 180 total works to download.


Downloading works from HAL: 100%|██████████| 180/180 [00:01<00:00, 120.78it/s]


-------------------------------------------
Download complete. Total validated records: 180

No duplicate entries were found.

Saving all 180 validated records (including duplicates)...
✅ Successfully saved all records to 'hal_blanchot_data.json'.



