In [None]:
import json
import csv
import pprint
from http.client import responses

from pyalex import Works
from typing import List, Optional, Literal
from pydantic import BaseModel, HttpUrl, ValidationError, validator, ConfigDict, Field, model_validator



In [71]:
class Affiliation(BaseModel):
    raw_affiliation_string: str
    institution_ids: list[HttpUrl]


class DehydratedAuthor(BaseModel):
    id: HttpUrl
    display_name: str
    orcid: Optional[HttpUrl]


class Institution(BaseModel):
    id: HttpUrl
    display_name: str
    ror: HttpUrl
    country_code: str
    type: str
    lineage: HttpUrl


class Authorship(BaseModel):
    model_config = ConfigDict(extra='allow')
    affiliations: list[Affiliation]
    author: DehydratedAuthor
    author_position: Optional[Literal['first', 'middle', 'last']]


    # # TODO: see if there are standardized ways to represent a country
    countries: list[str]
    institutions: list[Institution]
    is_corresponding: Optional[bool]


class DehydratedSource(BaseModel):
    id: HttpUrl
    display_name: str
    issn_l: str = Field(pattern=r'^\d{4}-\d{3}[\dX]$')
    issn: list[str]
    host_organization: HttpUrl
    type: Literal['journal', 'repository', 'conference', 'ebook', 'platform', 'book series', 'metadata', 'other']

    # TODO: figure out how to validate
    @model_validator(mode='after')
    def validate_issn(self):
        # for id in self.issn:
        #   re.
        pass



class Location(BaseModel):
    is_oa: bool
    landing_page_url: HttpUrl
    pdf_url: Optional[HttpUrl]
    source: DehydratedSource
    license: Optional[str]
    version: Optional[Literal['publishedVersion', 'acceptedVersion', 'submittedVersion']]
    is_accepted: bool
    is_published: bool

In [72]:
import json

test = json.loads('''{
        "is_oa": false,
        "landing_page_url": "https://doi.org/10.2307/3736058",
        "pdf_url": null,
        "source": {
          "id": "https://openalex.org/S93676754",
          "display_name": "The Modern Language Review",
          "issn_l": "0026-7937",
          "issn": [
            "0026-7937",
            "2222-4319"
          ],
          "is_oa": false,
          "is_in_doaj": false,
          "is_indexed_in_scopus": true,
          "is_core": false,
          "host_organization": "https://openalex.org/P4310321404",
          "host_organization_name": "Modern Humanities Research Association",
          "host_organization_lineage": [
            "https://openalex.org/P4310321404"
          ],
          "host_organization_lineage_names": [
            "Modern Humanities Research Association"
          ],
          "type": "journal"
        },
        "license": null,
        "license_id": null,
        "version": null,
        "is_accepted": false,
        "is_published": false
      }
    ''')
# TODO: [FINE FOR NOW---RETURN TO LATER] figure out issue causing pydantic to throw type=missing validation errors (see G for details)

In [73]:
Location.model_validate(test)

Location(is_oa=False, landing_page_url=HttpUrl('https://doi.org/10.2307/3736058'), pdf_url=None, source=None, license=None, version=None, is_accepted=False, is_published=False)

In [62]:
import pandas as pd, requests, urllib.parse, time
from tqdm import tqdm
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, HttpUrl, Field, ValidationError

# Dehydrated objects (minimal, nested representations)
class DehydratedConceptModel(BaseModel):
    id: HttpUrl
    display_name: str
    level: int
    score: float


class DehydratedInstitutionModel(BaseModel):
    id: HttpUrl
    display_name: str
    ror: Optional[HttpUrl] = None
    country_code: str
    type: str


class DehydratedAuthorModel(BaseModel):
    id: HttpUrl
    display_name: str
    orcid: Optional[HttpUrl] = None


# Nested objects that are part of a Work
class Biblio(BaseModel):
    volume: Optional[str] = None
    issue: Optional[str] = None
    first_page: Optional[str] = None
    last_page: Optional[str] = None


class Location(BaseModel):
    is_oa: bool
    landing_page_url: Optional[HttpUrl] = None
    pdf_url: Optional[HttpUrl] = None
    source: Optional[Dict[str, Any]] = None
    license: Optional[str] = None
    version: Optional[str] = None
    is_accepted: Optional[bool] = None
    is_published: Optional[bool] = None


class Authorship(BaseModel):
    author_position: str
    author: DehydratedAuthorModel
    institutions: List[DehydratedInstitutionModel]


# The main, top-level Work object model
class ComprehensiveWorkModel(BaseModel):
    id: HttpUrl
    doi: Optional[HttpUrl] = None
    title: str
    publication_year: Optional[int] = None
    publication_date: str
    language: Optional[str] = None
    type: str
    authorships: List[Authorship]
    primary_location: Optional[Location] = None
    locations: List[Location]
    cited_by_count: int
    referenced_works: List[HttpUrl]
    related_works: List[HttpUrl]
    concepts: List[DehydratedConceptModel]
    # We add 'extra="allow"' in a ConfigDict if we want to be less strict
    # For full validation, we omit it, and any extra field will raise an error.


BASE = "https://api.openalex.org/works"
filters = "title_and_abstract.search:Blanchot,publication_year: 1998-2025"
per_page = 200
records=[]
cursor = "*"
with tqdm(desc='Downloading', unit='page') as pbar:
    while True:
        url = f"{BASE}?filter={filters}&per_page={per_page}&cursor={cursor}"
        resp = requests.get(url).json()
        if pbar.total == None: pbar.total = resp['meta']['count']
        records.extend (resp['results'])
        pbar.update(len (resp['results']))

        next_cursor = resp['meta'].get('next_cursor')
        if not next_cursor:
            break
        cursor = next_cursor
        time.sleep(0.2)

Downloading: 4060page [00:11, 357.10page/s]                        
