In [1]:
import pyalex
from pyalex import config, Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import helpers
import os
import json
import time
from itertools import chain

In [2]:
# Email configuration for the polite pool.
# The contact address can be overridden through the OPENALEX_EMAIL environment
# variable, so other users can run the notebook without editing this cell;
# when the variable is unset the original address is used.
config.email = os.environ.get("OPENALEX_EMAIL", "bm57596@zut.edu.pl")

In [3]:
# Configuration for retrying failed requests
config.max_retries = 3
config.retry_backoff_factor = 0.5  # delay between two retries
# NOTE: the PyAlex setting is `retry_http_codes` (not `retry_https_codes`);
# a misspelled attribute is stored on the config object but never read.
config.retry_http_codes = [429, 500, 503]

In [4]:
# ROR URLs of the universities, keyed by a short code.
universities_ror_url = dict(
    PL_ZUT='https://ror.org/0596m7f19',            # West Pomeranian University of Technology in Szczecin
    BG_BFU='https://ror.org/02ek1bx64',            # Burgas Free University
    GR_UOP='https://ror.org/017wvtq80',            # University of Patras
    HR_UNIDU='https://ror.org/05yptqp13',          # University of Dubrovnik
    SL_EMUNI='https://ror.org/03761pf32',          # University EMUNI
    IT_UNISS='https://ror.org/01bnjbv91',          # University of Sassari
    FR_UAG='https://ror.org/02ryfmr77',            # University of the Antilles
    PT_UAC='https://ror.org/04276xd64',            # University of the Azores
    ES_UIB='https://ror.org/03e10x626',            # University of the Balearic Islands
    FR_ULHN='https://ror.org/05v509s40',           # University Le Havre Normandie
    FO_UF='https://ror.org/05mwmd090',             # University of the Faroe Islands
    DE_HOCHSTRALSUND='https://ror.org/04g99jx54',  # Stralsund University of Applied Sciences
    FI_AUAS='https://ror.org/05mknbx32',           # Åland University of Applied Sciences
)

# Same keys, but holding only the bare ROR ID taken from each URL
# by helpers.extract_id_from_url.
universities_ror_id = {
    code: helpers.extract_id_from_url(ror_url)
    for code, ror_url in universities_ror_url.items()
}

print(universities_ror_id)

{'PL_ZUT': '0596m7f19', 'BG_BFU': '02ek1bx64', 'GR_UOP': '017wvtq80', 'HR_UNIDU': '05yptqp13', 'SL_EMUNI': '03761pf32', 'IT_UNISS': '01bnjbv91', 'FR_UAG': '02ryfmr77', 'PT_UAC': '04276xd64', 'ES_UIB': '03e10x626', 'FR_ULHN': '05v509s40', 'FO_UF': '05mwmd090', 'DE_HOCHSTRALSUND': '04g99jx54', 'FI_AUAS': '05mknbx32'}


In [26]:
# ROR for the specific university - TESTING
ROR_ID = universities_ror_id['PL_ZUT']
print(f"Fetching works for institution ROR: {ROR_ID}")

# --- Fetch limits ---
# Upper bound on the number of works fetched; kept small while testing.
# Set to None to fetch ALL results.
MAX_WORKS = 300
# 200 is the maximum page size, which reduces the number of API calls needed.
PER_PAGE = 200

# --- Data Storage ---
all_papers_data = []
processed_count = 0
start_time = time.time()

try:
    # Query for works filtered by the institution's ROR ID.
    query = Works().filter(institutions={"ror": ROR_ID})
    # .paginate() yields one page (a list of works) at a time.
    pages = query.paginate(per_page=PER_PAGE, n_max=MAX_WORKS)

    # chain.from_iterable consumes the pages lazily: each page is requested only
    # when the previous one has been processed. (chain(*pages) would download
    # every page before the first work is handled, so an error part-way through
    # the download would leave nothing processed.)
    for work in chain.from_iterable(pages):
        processed_count += 1
        # Print progress periodically
        if processed_count % 200 == 0:
            elapsed_time = time.time() - start_time
            print(
                f"Processed {processed_count} works... (Time elapsed: {elapsed_time:.2f}s)")

        # TEST: show the available keys of the first work only
        if processed_count == 1:
            print("KEYS:", work.keys())

        # PyAlex automatically reconstructs the abstract from 'abstract_inverted_index'.
        # It will be None if no abstract is available in OpenAlex.
        # It needs to be accessed via [] though
        abstract = work['abstract']

        # Collect author and institution display names.
        # `or []` also covers keys that are present but set to None.
        authorships = work.get('authorships') or []
        author_names = []
        institutions_names = []
        for authorship in authorships:
            author = authorship.get('author')
            institutions = authorship.get('institutions') or []

            # TEST: show the raw author / institution records of the first work only
            if processed_count == 1:
                print("AUTHORS:", author)
                print("INSTITUTION:", institutions)

            for inst in institutions:
                inst_name = inst.get('display_name') if inst else None
                # Skip institutions without a name (never store None) and
                # keep each institution name only once per work.
                if inst_name and inst_name not in institutions_names:
                    institutions_names.append(inst_name)

            # Only record authors that actually have a display name
            if author and author.get('display_name'):
                author_names.append(author['display_name'])

        # Store the extracted data in our list.
        # .get() is used for safety (returns None if a key is missing).
        all_papers_data.append({
            # Core Identifiers & Basic Metadata
            "openalex_id": work.get('id'),
            "doi": work.get('doi'),
            "language": work.get('language'),
            "type": work.get('type'),
            "title": work.get('title'),
            "publication_date": work.get('publication_date'),
            # Location & Access
            ## Information about the "best" source/location for this work
            "primary_location": work.get('primary_location'),
            "open_access": work.get('open_access'),
            "best_oa_location": work.get('best_oa_location'),
            # Authorship & Affiliation
            "institutions": institutions_names,
            "authors": author_names,
            # Citation Metrics & Impact
            "cited_by_count": work.get('cited_by_count'),
            ## Field-Weighted Citation Impact, >1 is above average, <1 is below.
            "fwci": work.get('fwci'),
            ## Citation percentile (0-99) normalized by publication year and field (using primary topic). 99 means it's in the top 1% most cited for its cohort.
            "citation_normalized_percentile": work.get('citation_normalized_percentile'),
            # Status & Flags
            "is_retracted": work.get('is_retracted'),
            "is_paratext": work.get('is_paratext'),
            # Content & Topics
            "abstract": abstract,
            "primary_topic": work.get('primary_topic'),
            # a standardized, consistent way to categorize the work's subject matter according to OpenAlex
            "topics": work.get('topics'),
            # specific terms the authors themselves used to label their work.
            "keywords": work.get('keywords'),
            # API & Metadata Timestamps
            "cited_by_api_url": work.get('cited_by_api_url'),
            "updated_date": work.get('updated_date'),
            "created_date": work.get('created_date'),
        })

except Exception as e:
    # Best effort: report the failure and keep whatever was collected so far
    # in all_papers_data instead of aborting the notebook.
    print(f"\nAn error occurred during fetching: {e}")
    print(f"Processed {processed_count} works before the error.")

# --- Output Results ---
# Report how many papers were collected and how long the fetch took.
end_time = time.time()
print("\nFinished fetching.")
print(f"Total papers found and processed: {len(all_papers_data)}")
print(f"Total time taken: {end_time - start_time:.2f} seconds")

# Basic - show a short summary of the first three papers as an example
if not all_papers_data:
    print("\nNo papers found for this institution or an error occurred before fetching any.")
else:
    print("\n--- Example Paper Data (First 3) ---")
    for number, paper in enumerate(all_papers_data[:3], start=1):
        print(f"\nPaper {number}:")
        print(f"  ID: {paper['openalex_id']}")
        print(f"  Title: {paper['title']}")
        print(f"  Date: {paper['publication_date']}")

        # Abstract preview: cut to 300 characters, with an ellipsis when truncated
        abstract_text = paper['abstract']
        if abstract_text and len(abstract_text) > 300:
            abstract_preview = abstract_text[:300] + '...'
        else:
            abstract_preview = abstract_text
        print(f"  Abstract: {abstract_preview or 'N/A'}")

        print(f"  Authors: {', '.join(paper['authors'])}")

Fetching works for institution ROR: 0596m7f19
KEYS: dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'institution_assertions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'datasets', 'versions', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'abstract_inverted_index_v3', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])
AUTHORS: {'id': 'https://openalex.org/A5108230219', 'display_name':

In [27]:
# Advanced - Display the first few results as an example


def _or_na(value):
    """Return `value` unchanged, or the placeholder 'N/A' when it is None.

    The stored paper dicts always contain every key (possibly with a None
    value), so a `.get(key, 'N/A')` default is never used. Only None is
    replaced here, so meaningful falsy values such as False or 0 are still
    printed as-is.
    """
    return 'N/A' if value is None else value


def _source_name(location):
    """Return the display name of the 'source' entry of `location`, or 'N/A' when unavailable."""
    source_info = location.get('source') or {}
    return _or_na(source_info.get('display_name'))


if all_papers_data:
    print("\n--- Example Paper Data (First 3) ---")
    for i, paper in enumerate(all_papers_data[:3]):
        print(f"\nPaper {i+1}:")
        print("-" * 20)  # Separator for clarity

        # --- Core Identifiers & Basic Metadata ---
        print(f"  OpenAlex ID: {_or_na(paper.get('openalex_id'))}")
        print(f"  DOI: {_or_na(paper.get('doi'))}")
        print(f"  Language: {_or_na(paper.get('language'))}")
        print(f"  Type: {_or_na(paper.get('type'))}")
        print(f"  Title: {_or_na(paper.get('title'))}")
        print(f"  Publication Date: {_or_na(paper.get('publication_date'))}")

        # --- Location & Access ---
        # Primary Location (showing key info)
        primary_loc = paper.get('primary_location')
        if primary_loc and isinstance(primary_loc, dict):
            source_name = _source_name(primary_loc)
            lp_url = _or_na(primary_loc.get('landing_page_url'))
            print(
                f"  Primary Location: Source='{source_name}', LandingPage='{lp_url}', IsOA={primary_loc.get('is_oa')}")
        else:
            print(
                f"  Primary Location: {primary_loc if primary_loc else 'N/A'}")

        # Open Access (showing key info)
        oa_info = paper.get('open_access')
        if oa_info and isinstance(oa_info, dict):
            oa_status = _or_na(oa_info.get('oa_status'))
            oa_url = _or_na(oa_info.get('oa_url'))
            print(
                f"  Open Access: Status='{oa_status}', IsOA={oa_info.get('is_oa')}, OA_URL='{oa_url}'")
        else:
            print(f"  Open Access: {oa_info if oa_info else 'N/A'}")

        # Best OA Location (showing key info)
        best_oa_loc = paper.get('best_oa_location')
        if best_oa_loc and isinstance(best_oa_loc, dict):
            source_name = _source_name(best_oa_loc)
            pdf_url = _or_na(best_oa_loc.get('pdf_url'))
            print(
                f"  Best OA Location: Source='{source_name}', PDF='{pdf_url}', Version='{best_oa_loc.get('version')}', License='{best_oa_loc.get('license')}'")
        else:
            print(
                f"  Best OA Location: {best_oa_loc if best_oa_loc else 'N/A'}")

        # --- Authorship & Affiliation ---
        # Institutions (entries without a name are dropped so that join() cannot fail)
        inst_list = [name for name in (paper.get('institutions') or []) if name]
        print(
            f"  Institutions ({len(inst_list)}): {', '.join(inst_list) if inst_list else 'N/A'}")

        # Authors
        authors_list = [name for name in (paper.get('authors') or []) if name]
        print(
            f"  Authors ({len(authors_list)}): {', '.join(authors_list) if authors_list else 'N/A'}")

        # --- Citation Metrics & Impact ---
        print(f"  Cited By Count: {_or_na(paper.get('cited_by_count'))}")
        # Field-Weighted Citation Impact
        print(f"  FWCI: {_or_na(paper.get('fwci'))}")
        # Year/Field Normalized
        print(
            f"  Citation Percentile: {_or_na(paper.get('citation_normalized_percentile'))}")

        # --- Status & Flags ---
        print(f"  Is Retracted: {_or_na(paper.get('is_retracted'))}")
        print(f"  Is Paratext: {_or_na(paper.get('is_paratext'))}")

        # --- Content & Topics ---
        # Abstract preview: cut to 400 characters, with an ellipsis when truncated
        abstract_text = paper.get('abstract')
        abstract_preview = (abstract_text[:400] + '...') if abstract_text and len(
            abstract_text) > 400 else abstract_text
        print(f"  Abstract: {abstract_preview if abstract_preview else 'N/A'}")

        # Primary Topic (showing display name)
        primary_topic_info = paper.get('primary_topic')
        if isinstance(primary_topic_info, dict):
            primary_topic_name = _or_na(primary_topic_info.get('display_name'))
        else:
            primary_topic_name = 'N/A'
        print(f"  Primary Topic: {primary_topic_name}")

        # Topics (showing count and first few display names).
        # `or []` covers a stored None, which would otherwise break slicing and len().
        topics_list = paper.get('topics') or []
        topic_names = [_or_na(t.get('display_name'))
                       for t in topics_list[:3] if isinstance(t, dict)]
        print(
            f"  Topics ({len(topics_list)}): {topic_names}{'...' if len(topics_list) > 3 else ''}")

        # Keywords (showing count and first few keywords)
        keywords_list = paper.get('keywords') or []
        keyword_strings = [_or_na(k.get('display_name'))
                           for k in keywords_list[:5] if isinstance(k, dict)]
        print(
            f"  Keywords ({len(keywords_list)}): {keyword_strings}{'...' if len(keywords_list) > 5 else ''}")

        # --- API & Metadata Timestamps ---
        print(f"  Cited By API URL: {_or_na(paper.get('cited_by_api_url'))}")
        print(f"  Updated Date: {_or_na(paper.get('updated_date'))}")
        print(f"  Created Date: {_or_na(paper.get('created_date'))}")

else:
    print("\nNo papers found matching the criteria or an error occurred before fetching any.")


--- Example Paper Data (First 3) ---

Paper 1:
--------------------
  OpenAlex ID: https://openalex.org/W2059474375
  DOI: https://doi.org/10.1016/j.progpolymsci.2012.04.003
  Language: en
  Type: article
  Title: Biocomposites reinforced with natural fibers: 2000–2010
  Publication Date: 2012-05-02
  Primary Location: Source='Progress in Polymer Science', LandingPage='https://doi.org/10.1016/j.progpolymsci.2012.04.003', IsOA=False
  Open Access: Status='closed', IsOA=False, OA_URL='None'
  Best OA Location: N/A
  Institutions (4): University of Toronto, University of Kassel, West Pomeranian University of Technology, Fraunhofer Institute for Applied Polymer Research
  Authors (4): Omar Faruk, Andrzej K. Błędzki, Hans‐Peter Fink, Mohini Sain
  Cited By Count: 3697
  FWCI: 129.917
  Citation Percentile: {'value': 0.999898, 'is_in_top_1_percent': True, 'is_in_top_10_percent': True}
  Is Retracted: False
  Is Paratext: False
  Abstract: N/A
  Primary Topic: Natural Fiber Reinforced Compos