# This notebook fetches all the EUNICoast authors from OpenAlex 

In [1]:
from pyalex import config, Authors
import helpers
import time
from itertools import chain
import pandas as pd
import pyarrow  # engine for parquet

In [2]:
# Contact email sent with every request so OpenAlex routes us to the polite pool
config.email = "bm57596@zut.edu.pl"

# Retry policy for failed requests
config.max_retries = 3
# Backoff *factor*, not a fixed delay: the wait grows with each successive retry
config.retry_backoff_factor = 0.5
# pyalex reads `retry_http_codes`; the previous `retry_https_codes` spelling
# silently created an attribute that the library never consults.
config.retry_http_codes = [429, 500, 503]

# ROR URL of every partner university, keyed by "<country>_<short name>"
universities_ror_url = {
    # West Pomeranian University of Technology in Szczecin
    'PL_ZUT': 'https://ror.org/0596m7f19',
    # Burgas Free University
    'BG_BFU': 'https://ror.org/02ek1bx64',
    # University of Patras
    'GR_UOP': 'https://ror.org/017wvtq80',
    # University of Dubrovnik
    'HR_UNIDU': 'https://ror.org/05yptqp13',
    # University EMUNI
    'SL_EMUNI': 'https://ror.org/03761pf32',
    # University of Sassari
    'IT_UNISS': 'https://ror.org/01bnjbv91',
    # University of the Antilles
    'FR_UAG': 'https://ror.org/02ryfmr77',
    # University of the Azores
    'PT_UAC': 'https://ror.org/04276xd64',
    # University of the Balearic Islands
    'ES_UIB': 'https://ror.org/03e10x626',
    # University Le Havre Normandie
    'FR_ULHN': 'https://ror.org/05v509s40',
    # University of the Faroe Islands
    'FO_UF': 'https://ror.org/05mwmd090',
    # Stralsund University of Applied Sciences
    'DE_HOCHSTRALSUND': 'https://ror.org/04g99jx54',
    # Åland University of Applied Sciences
    'FI_AUAS': 'https://ror.org/05mknbx32',
}

# Same keys, but holding only the bare ROR identifier (the last URL segment,
# as the printed result shows) instead of the full URL.
universities_ror_id = {
    key: helpers.extract_id_from_url(url)
    for key, url in universities_ror_url.items()
}

print(universities_ror_id)

{'PL_ZUT': '0596m7f19', 'BG_BFU': '02ek1bx64', 'GR_UOP': '017wvtq80', 'HR_UNIDU': '05yptqp13', 'SL_EMUNI': '03761pf32', 'IT_UNISS': '01bnjbv91', 'FR_UAG': '02ryfmr77', 'PT_UAC': '04276xd64', 'ES_UIB': '03e10x626', 'FR_ULHN': '05v509s40', 'FO_UF': '05mwmd090', 'DE_HOCHSTRALSUND': '04g99jx54', 'FI_AUAS': '05mknbx32'}


In [3]:
def _author_record(author):
    """Reduce a raw OpenAlex author object to the fields kept for analysis.

    The lookups use the keys the API actually returns (see the ``Keys:``
    line printed for the first author): ``last_known_institutions``,
    ``summary_stats``, ``x_concepts``, ``id`` and the nested ``ids`` mapping.
    The output keys are unchanged so downstream consumers of the saved data
    keep working.
    """
    # External identifiers (openalex, orcid, scopus, ...) live under 'ids'
    ids = author.get('ids') or {}
    return {
        "affiliations": author.get('affiliations', []),
        "cited_by_count": author.get('cited_by_count'),
        # NOTE: the misspelled output key is kept on purpose for backward
        # compatibility with data already written under this column name.
        "last_known_insitution": author.get('last_known_institutions', []),
        "orcid": author.get('orcid'),
        "scopus": ids.get('scopus'),
        # summary_stats is a mapping; fall back to None (not a list) so the
        # column never mixes dicts and lists.
        "summary_stats": author.get('summary_stats'),
        "works_count": author.get('works_count'),
        # Authors expose concepts under 'x_concepts'
        "concepts": author.get('x_concepts', []),
        'openalex_id': author.get('id'),
    }


def fetch_authors_by_ror(ror_id, n_max=400):
    """Fetch OpenAlex authors affiliated with the institution ``ror_id``.

    Args:
        ror_id: Bare ROR identifier of the institution (e.g. ``'0596m7f19'``).
        n_max: Maximum number of authors to retrieve; ``None`` fetches all.

    Returns:
        A list with one dict per author (see ``_author_record`` for the
        fields). If any error occurs while fetching, the error is printed
        and an empty list is returned, i.e. partial results are discarded.
    """
    all_authors_data = []
    processed_count = 0
    start_time = time.time()

    try:
        query = Authors().filter(affiliations={"institution": {"ror": ror_id}})

        # chain.from_iterable consumes pages lazily, so progress messages and
        # the "processed before the error" count reflect real fetch progress
        # (chain(*pages) would download every page before the loop starts).
        pages = query.paginate(per_page=200, n_max=n_max)
        for author in chain.from_iterable(pages):
            processed_count += 1

            if processed_count % 200 == 0:
                elapsed_time = time.time() - start_time
                print(
                    f"Processed {processed_count} authors... (Time elapsed: {elapsed_time:.2f}s)")

            # Show the first raw record once so the available schema is visible
            if processed_count == 1:
                print("Keys:", author.keys())
                print("AUTHOR:", author)

            all_authors_data.append(_author_record(author))

        end_time = time.time()
        print("\nFinished fetching.")
        print(f"Total authors found and processed: {len(all_authors_data)}")
        print(f"Total time taken: {end_time - start_time:.2f} seconds")

        return all_authors_data

    except Exception as e:
        # Best-effort: report the failure and signal it with an empty result
        print(f"\nAn error occurred during fetching: {e}")
        print(f"Processed {processed_count} authors before the error.")
        return []
            
# Trial run against a single institution (PL_ZUT); n_max is left at the
# function's default, so only a capped sample of authors is fetched.
ROR_ID = universities_ror_id['PL_ZUT']
print(f"Fetching authors for institution ROR: {ROR_ID}")
example_authors_data = fetch_authors_by_ror(ror_id=ROR_ID)




Fetching authors for institution ROR: 0596m7f19
Keys: dict_keys(['id', 'orcid', 'display_name', 'display_name_alternatives', 'works_count', 'cited_by_count', 'summary_stats', 'ids', 'affiliations', 'last_known_institutions', 'topics', 'topic_share', 'x_concepts', 'counts_by_year', 'works_api_url', 'updated_date', 'created_date'])
AUTHOR: {'id': 'https://openalex.org/A5010715704', 'orcid': 'https://orcid.org/0000-0002-2241-9764', 'display_name': 'Оleg G. Sinyashin', 'display_name_alternatives': ['O. G. Sinyashin', "Oleg Gerol'dovich Sinyashin", 'Оleg G. Sinyashin', 'Oleg G. Sinyashin', 'Oleg. G. Sinyashin', 'O.G Sinyashin', 'O. Sinyashin', 'O. G. Sinyashina', 'Oleg Geroldovich Sinyashin', 'O. G. Sinyáshin', 'Oleg Sinyashin'], 'works_count': 1141, 'cited_by_count': 8358, 'summary_stats': {'2yr_mean_citedness': 1.783132530120482, 'h_index': 41, 'i10_index': 259}, 'ids': {'openalex': 'https://openalex.org/A5010715704', 'orcid': 'https://orcid.org/0000-0002-2241-9764'}, 'affiliations': [{'i

In [4]:
# Author records per university, keyed like universities_ror_id
universities_authors = {}

# Full harvest: n_max=None lifts the cap so every author is fetched
for key, ror_id in universities_ror_id.items():
    print(f"Fetching authors for institution {key}")
    universities_authors[key] = fetch_authors_by_ror(ror_id, n_max=None)

# Number of universities processed (not the number of authors)
print(len(universities_authors))

Fetching authors for institution PL_ZUT
Keys: dict_keys(['id', 'orcid', 'display_name', 'display_name_alternatives', 'works_count', 'cited_by_count', 'summary_stats', 'ids', 'affiliations', 'last_known_institutions', 'topics', 'topic_share', 'x_concepts', 'counts_by_year', 'works_api_url', 'updated_date', 'created_date'])
AUTHOR: {'id': 'https://openalex.org/A5010715704', 'orcid': 'https://orcid.org/0000-0002-2241-9764', 'display_name': 'Оleg G. Sinyashin', 'display_name_alternatives': ['O. G. Sinyashin', "Oleg Gerol'dovich Sinyashin", 'Оleg G. Sinyashin', 'Oleg G. Sinyashin', 'Oleg. G. Sinyashin', 'O.G Sinyashin', 'O. Sinyashin', 'O. G. Sinyashina', 'Oleg Geroldovich Sinyashin', 'O. G. Sinyáshin', 'Oleg Sinyashin'], 'works_count': 1141, 'cited_by_count': 8358, 'summary_stats': {'2yr_mean_citedness': 1.783132530120482, 'h_index': 41, 'i10_index': 259}, 'ids': {'openalex': 'https://openalex.org/A5010715704', 'orcid': 'https://orcid.org/0000-0002-2241-9764'}, 'affiliations': [{'instituti

In [6]:
# Flatten the per-university lists into one list of records, tagging each
# record with the key of the university it was fetched for. Entries that are
# not a list of dicts are skipped.
all_authors_list = [
    {**author_doc, 'university_key': university_key}
    for university_key, authors_list in universities_authors.items()
    if isinstance(authors_list, list)
    for author_doc in authors_list
    if isinstance(author_doc, dict)
]

if not all_authors_list:
    print("No authors data was collected to save")
else:
    df = pd.DataFrame(all_authors_list)
    parquet_file_path = 'authors_raw_data.parquet'
    try:
        # Persist the raw records; the index carries no information here
        df.to_parquet(parquet_file_path, index=False, engine='pyarrow')
    except Exception as e:
        print(f"Error saving data to Parquet: {e}")
    else:
        print(
            f"Successfully saved data for {len(df)} authors to {parquet_file_path}")

Successfully saved data for 44381 authors to authors_raw_data.parquet
