In [1]:
from pyalex import config, Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import helpers
import time
from itertools import chain
import pandas as pd
import pyarrow # engine for parquet


In [2]:
# Email Configuration for the polite pool
config.email = "bm57596@zut.edu.pl"

In [3]:
# Configuration for retrying requests
config.max_retries = 3
config.retry_backoff_factor = 0.5  # delay between two retries
config.retry_https_codes = [429, 500, 503]

In [4]:
universities_ror_url = {
    # West Pomeranian University of Technology in Szczecin
    'PL_ZUT': 'https://ror.org/0596m7f19',
    # Burgas Free University
    'BG_BFU': 'https://ror.org/02ek1bx64',
    # University of Patras
    'GR_UOP': 'https://ror.org/017wvtq80',
    # University of Dubrovnik
    'HR_UNIDU': 'https://ror.org/05yptqp13',
    # University EMUNI
    'SL_EMUNI': 'https://ror.org/03761pf32',
    # University of Sassari
    'IT_UNISS': 'https://ror.org/01bnjbv91',
    # University of the Antilles
    'FR_UAG': 'https://ror.org/02ryfmr77',
    # University of the Azores
    'PT_UAC': 'https://ror.org/04276xd64',
    # University of the Balearic Islands
    'ES_UIB': 'https://ror.org/03e10x626',
    # University Le Havre Normandie
    'FR_ULHN': 'https://ror.org/05v509s40',
    # University of the Faroe Islands
    'FO_UF': 'https://ror.org/05mwmd090',
    # Stralsund University of Applied Sciences
    'DE_HOCHSTRALSUND': 'https://ror.org/04g99jx54',
    # Åland University of Applied Sciences
    'FI_AUAS': 'https://ror.org/05mknbx32',
}

# Just ROR IDs stored in universities_ror_id dictionary 
universities_ror_id = {}
for key in universities_ror_url:
    universities_ror_id[key] = helpers.extract_id_from_url(universities_ror_url[key])

print(universities_ror_id)

{'PL_ZUT': '0596m7f19', 'BG_BFU': '02ek1bx64', 'GR_UOP': '017wvtq80', 'HR_UNIDU': '05yptqp13', 'SL_EMUNI': '03761pf32', 'IT_UNISS': '01bnjbv91', 'FR_UAG': '02ryfmr77', 'PT_UAC': '04276xd64', 'ES_UIB': '03e10x626', 'FR_ULHN': '05v509s40', 'FO_UF': '05mwmd090', 'DE_HOCHSTRALSUND': '04g99jx54', 'FI_AUAS': '05mknbx32'}


In [5]:

def fetch_papers_by_ror(ror_id, n_max=300):
    """
    Fetches research papers associated with a given Research Organization Registry (ROR) ID from the OpenAlex API.

    Args:
        ror_id (str): The ROR ID of the institution.
        n_max (int, optional): The maximum number of papers to fetch. Defaults to 300. Set to None to fetch all.

    Returns:
        list: A list of dictionaries, where each dictionary contains the extracted data of a research paper.
              Returns an empty list if an error occurs during fetching.
    """
    all_papers_data = []
    processed_count = 0
    start_time = time.time()

    try:
        query = Works().filter(institutions={"ror": ror_id})

        for work in chain(*query.paginate(per_page=200, n_max=n_max)):
            processed_count += 1

            if processed_count % 200 == 0:
                elapsed_time = time.time() - start_time
                print(
                    f"Processed {processed_count} works... (Time elapsed: {elapsed_time:.2f}s)")

            # if processed_count == 1:
            #     print("KEYS:", work.keys())

            abstract = work['abstract']
            authorships = work.get('authorships', [])
            author_names = []
            institutions_names = []

            for authorship in authorships:
                author = authorship.get('author')
                institutions = authorship.get('institutions')

                # if processed_count == 1:
                #     print("AUTHORS:", author)

                for inst in institutions:
                    name_to_add = None
                    if inst and inst.get('display_name'):
                        name_to_add = inst['display_name']
                    if name_to_add not in institutions_names:
                        institutions_names.append(name_to_add)

                if author and author.get('display_name'):
                    author_names.append(author['display_name'])

            all_papers_data.append({
                "openalex_id": work.get('id'),
                "doi": work.get('doi'),
                "language": work.get('language'),
                "type": work.get('type'),
                "title": work.get('title'),
                "publication_date": work.get('publication_date'),
                "primary_location": work.get('primary_location'),
                "open_access": work.get('open_access'),
                # "best_oa_location": work.get('best_oa_location'),
                "institutions": institutions_names,
                "authors": author_names,
                "cited_by_count": work.get('cited_by_count'),
                "fwci": work.get('fwci'),
                "citation_normalized_percentile": work.get('citation_normalized_percentile'),
                "is_retracted": work.get('is_retracted'),
                "is_paratext": work.get('is_paratext'),
                "abstract": abstract,
                "primary_topic": work.get('primary_topic'),
                "topics": work.get('topics'),
                "keywords": work.get('keywords'),
                "cited_by_api_url": work.get('cited_by_api_url'),
                "updated_date": work.get('updated_date'),
                "created_date": work.get('created_date'),
            })

    except Exception as e:
        print(f"\nAn error occurred during fetching: {e}")
        print(f"Processed {processed_count} works before the error.")
        return []

    end_time = time.time()
    print(f"\nFinished fetching.")
    print(f"Total papers found and processed: {len(all_papers_data)}")
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

    return all_papers_data

# --- Example Usage ---
# ROR for the specific university
# ROR_ID = universities_ror_id['PL_ZUT']
# print(f"Fetching works for institution ROR: {ROR_ID}")

# # Fetch papers for the specified ROR ID
# example_papers_data = fetch_papers_by_ror(ROR_ID, n_max=300)


In [6]:
universities_papers = {}

for key in universities_ror_id:
    ROR_ID = universities_ror_id[key]
    print(f"Fetching works for institution {key} ROR: {ROR_ID}")
    # CHANGE n_max to None to fetch all papers
    papers_data = fetch_papers_by_ror(ROR_ID, n_max=None)
    universities_papers[key] = papers_data

print(len(universities_papers))

Fetching works for institution PL_ZUT ROR: 0596m7f19
Processed 200 works... (Time elapsed: 85.38s)
Processed 400 works... (Time elapsed: 85.40s)
Processed 600 works... (Time elapsed: 85.41s)
Processed 800 works... (Time elapsed: 85.43s)
Processed 1000 works... (Time elapsed: 85.44s)
Processed 1200 works... (Time elapsed: 85.46s)
Processed 1400 works... (Time elapsed: 85.48s)
Processed 1600 works... (Time elapsed: 85.50s)
Processed 1800 works... (Time elapsed: 85.57s)
Processed 2000 works... (Time elapsed: 85.59s)
Processed 2200 works... (Time elapsed: 85.61s)
Processed 2400 works... (Time elapsed: 85.62s)
Processed 2600 works... (Time elapsed: 85.63s)
Processed 2800 works... (Time elapsed: 85.65s)
Processed 3000 works... (Time elapsed: 85.66s)
Processed 3200 works... (Time elapsed: 85.68s)
Processed 3400 works... (Time elapsed: 85.70s)
Processed 3600 works... (Time elapsed: 85.72s)
Processed 3800 works... (Time elapsed: 85.73s)
Processed 4000 works... (Time elapsed: 85.74s)
Processed 4

### Total processing time
Total processing time for all the papers at the 13 universities is **28m15s**

In [14]:
# Advanced - Display the first few results as an example
def display_paper_data(paper_data):
    if paper_data:
        print("\n--- Example Paper Data (First Few) ---")
        for i, paper in enumerate(paper_data[0:15:5]):
            print(f"\nPaper {i+1}:")
            print("-" * 20)  # Separator for clarity

            # --- Core Identifiers & Basic Metadata ---
            print(f"  OpenAlex ID: {paper.get('openalex_id', 'N/A')}")
            print(f"  DOI: {paper.get('doi', 'N/A')}")
            print(f"  Language: {paper.get('language', 'N/A')}")
            print(f"  Type: {paper.get('type', 'N/A')}")
            print(f"  Title: {paper.get('title', 'N/A')}")
            print(f"  Publication Date: {paper.get('publication_date', 'N/A')}")

            # --- Location & Access ---
            # Primary Location (showing key info)
            primary_loc = paper.get('primary_location')
            if primary_loc and isinstance(primary_loc, dict):
                source_info = primary_loc.get('source', {})
                source_name = source_info.get(
                    'display_name', 'N/A') if source_info else 'N/A'
                lp_url = primary_loc.get('landing_page_url', 'N/A')
                print(
                    f"  Primary Location: Source='{source_name}', LandingPage='{lp_url}', IsOA={primary_loc.get('is_oa')}")
            else:
                print(
                    f"  Primary Location: {primary_loc if primary_loc else 'N/A'}")

            # Open Access (showing key info)
            oa_info = paper.get('open_access')
            if oa_info and isinstance(oa_info, dict):
                print(
                    f"  Open Access: Status='{oa_info.get('oa_status', 'N/A')}', IsOA={oa_info.get('is_oa')}, OA_URL='{oa_info.get('oa_url', 'N/A')}'")
            else:
                print(f"  Open Access: {oa_info if oa_info else 'N/A'}")

            # Best OA Location (showing key info)
            best_oa_loc = paper.get('best_oa_location')
            if best_oa_loc and isinstance(best_oa_loc, dict):
                source_info = best_oa_loc.get('source', {})
                source_name = source_info.get(
                    'display_name', 'N/A') if source_info else 'N/A'
                pdf_url = best_oa_loc.get('pdf_url', 'N/A')
                print(
                    f"  Best OA Location: Source='{source_name}', PDF='{pdf_url}', Version='{best_oa_loc.get('version')}', License='{best_oa_loc.get('license')}'")
            else:
                print(
                    f"  Best OA Location: {best_oa_loc if best_oa_loc else 'N/A'}")

            # --- Authorship & Affiliation ---
            # Institutions
            inst_list = paper.get('institutions', [])
            print(
                f"  Institutions ({len(inst_list)}): {', '.join(inst_list) if inst_list else 'N/A'}")

            # Authors
            authors_list = paper.get('authors', [])
            print(
                f"  Authors ({len(authors_list)}): {', '.join(authors_list) if authors_list else 'N/A'}")

            # --- Citation Metrics & Impact ---
            print(f"  Cited By Count: {paper.get('cited_by_count', 'N/A')}")
            # Field-Weighted Citation Impact
            print(f"  FWCI: {paper.get('fwci', 'N/A')}")
            # Year/Field Normalized
            print(
                f"  Citation Percentile: {paper.get('citation_normalized_percentile', 'N/A')}")

            # --- Status & Flags ---
            print(f"  Is Retracted: {paper.get('is_retracted', 'N/A')}")
            print(f"  Is Paratext: {paper.get('is_paratext', 'N/A')}")

            # --- Content & Topics ---
            # Abstract Preview
            abstract_text = paper.get('abstract')
            abstract_preview = (abstract_text[:400] + '...') if abstract_text and len(
                abstract_text) > 400 else abstract_text
            print(f"  Abstract: {abstract_preview if abstract_preview else 'N/A'}")

            # Primary Topic (showing display name)
            primary_topic_info = paper.get('primary_topic')
            primary_topic_name = primary_topic_info.get(
                'display_name', 'N/A') if primary_topic_info and isinstance(primary_topic_info, dict) else 'N/A'
            print(f"  Primary Topic: {primary_topic_name}")

            # Topics (showing count and first few display names)
            topics_list = paper.get('topics', [])
            topic_names = [t.get('display_name', 'N/A')
                        for t in topics_list[:3] if isinstance(t, dict)]
            print(
                f"  Topics ({len(topics_list)}): {topic_names}{'...' if len(topics_list) > 3 else ''}")

            # Keywords (showing count and first few keywords)      
            keywords_list = paper.get('keywords', [])
            keyword_strings = [k.get('display_name', 'N/A')
                            for k in keywords_list[:5] if isinstance(k, dict)]
            print(
                f"  Keywords ({len(keywords_list)}): {keyword_strings}{'...' if len(keywords_list) > 5 else ''}")

            # --- API & Metadata Timestamps ---
            print(f"  Cited By API URL: {paper.get('cited_by_api_url', 'N/A')}")
            print(f"  Updated Date: {paper.get('updated_date', 'N/A')}")
            print(f"  Created Date: {paper.get('created_date', 'N/A')}")

    else:
        print("\nNo papers found matching the criteria or an error occurred before fetching any.")

In [15]:
print(universities_papers.keys())
display_paper_data(universities_papers['FI_AUAS'])

dict_keys(['PL_ZUT', 'BG_BFU', 'GR_UOP', 'HR_UNIDU', 'SL_EMUNI', 'IT_UNISS', 'FR_UAG', 'PT_UAC', 'ES_UIB', 'FR_ULHN', 'FO_UF', 'DE_HOCHSTRALSUND', 'FI_AUAS'])

--- Example Paper Data (First Few) ---

Paper 1:
--------------------
  OpenAlex ID: https://openalex.org/W1987865306
  DOI: https://doi.org/10.1016/s0140-6736(69)91454-8
  Language: en
  Type: article
  Title: TURPENTINE AND THROMBOCYTOPENIC PURPURA
  Publication Date: 1969-07-01
  Primary Location: Source='The Lancet', LandingPage='https://doi.org/10.1016/s0140-6736(69)91454-8', IsOA=False
  Open Access: Status='closed', IsOA=False, OA_URL='None'
  Best OA Location: N/A
  Institutions (1): Åland University of Applied Sciences
  Authors (2): Peter Wahlberg, Dag Nyman
  Cited By Count: 198
  FWCI: 1.816
  Citation Percentile: {'value': 0.992992, 'is_in_top_1_percent': True, 'is_in_top_10_percent': True}
  Is Retracted: False
  Is Paratext: False
  Abstract: N/A
  Primary Topic: Neurological and metabolic disorders
  Topics (2): 

In [16]:
for uni in universities_ror_id:
    inst = Works().filter(institutions={"ror": universities_ror_id[uni]}).count()
    print(f"Number of works for {uni}: {inst}")
# for uni in universities_papers:
    # print(f"\n--- {uni} ---")

Number of works for PL_ZUT: 13803
Number of works for BG_BFU: 642
Number of works for GR_UOP: 49555
Number of works for HR_UNIDU: 2660
Number of works for SL_EMUNI: 67
Number of works for IT_UNISS: 25047
Number of works for FR_UAG: 3790
Number of works for PT_UAC: 6533
Number of works for ES_UIB: 21872
Number of works for FR_ULHN: 6977
Number of works for FO_UF: 1791
Number of works for DE_HOCHSTRALSUND: 1081
Number of works for FI_AUAS: 76


## Save the data to a Parquet file

In [None]:
all_papers_list = []
for university_key, papers_list in universities_papers.items():
    if isinstance(papers_list, list):
        for paper_doc in papers_list:
            if isinstance(paper_doc, dict):
                # Add the university key to each paper document
                paper_doc_with_uni = paper_doc.copy()
                paper_doc_with_uni['university_key'] = university_key
                all_papers_list.append(paper_doc_with_uni)

# Convert the list of dictionaries to a Pandas DataFrame
if all_papers_list:
    df = pd.DataFrame(all_papers_list)

    # Define the output file path
    parquet_file_path = 'university_papers_data.parquet'

    try:
        # Save the DataFrame to a Parquet file
        # `index=False` prevents writing the DataFrame index as a column
        # `engine='pyarrow'` is common, 'fastparquet' is another option
        df.to_parquet(parquet_file_path, index=False, engine='pyarrow')
        print(
            f"Successfully saved data for {len(df)} papers to {parquet_file_path}")


    except Exception as e:
        print(f"Error saving data to Parquet: {e}")
else:
    print("No paper data was collected to save.")


# --- Later, to read this file to index into OpenSearch ---
# df_loaded = pd.read_parquet(parquet_file_path, engine='pyarrow')
# # Convert DataFrame back to list of dicts for bulk helper
# actions_to_index = df_loaded.to_dict('records')
# # Adapt the generate_bulk_actions function to take this list

Successfully saved data for 133964 papers to university_papers_data2.parquet
