https://support.datacite.org/docs/api-get-lists

In [5]:
import requests
import pandas as pd
import numpy as np
import time
from dataCite_fun import getDataCiteCitations_relationTypes
from crossRef_fun import getPublicationInfo

In [None]:
datacite_url = 'https://api.datacite.org/events'
# https://api.datacite.org/events?page[cursor]=1&page[size]=1000
headers = {
    'prefix': '10.5285',
    'page[cursor]': '1',
    'page[size]': '10'
}
r = requests.get(datacite_url, headers)
print(r.status_code)
r.json().keys()


In [None]:
r.json()['data']

In [None]:
r.json()['links']

In [None]:
# get more info from datacite API 
# hopefully end up with these columns:
# ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors']

pubDOI = '10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359'
r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
print(r.status_code)
print(r.json()['data']['attributes'].keys())
r.json()['data']['attributes']


## Workflow

In [None]:
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to', 'is-part-of']

In [None]:
dataCite_df_relationTypes = getDataCiteCitations_relationTypes.getDataCiteCitations_relationTypes(relation_type_id_list)

In [None]:
dataCite_df_relationTypes

In [None]:
# remove http from DOI url
doi_list = []
for url in dataCite_df_relationTypes['subj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['data_doi'] = doi_list

dataCite_df_relationTypes = dataCite_df_relationTypes.drop(['subj-id'], axis=1)

doi_list = []
for url in dataCite_df_relationTypes['obj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['subj_id'] = doi_list # rename to work in getPublicationInfo function 
 # they seem to be the wrong way round from datacite - double check this

dataCite_df_relationTypes = dataCite_df_relationTypes.rename(columns={'obj-id':'subj_id'})

In [None]:
dataCite_df_relationTypes

In [None]:
# get dataset metadata 
info_list = []
headers = {'client-id': 'bl.nerc'}
api_url = 'https://api.datacite.org/dois/' 
for doi in dataCite_df_relationTypes['data_doi']:
    r = requests.get((api_url + doi), headers)
    print(r.status_code, doi)
    
    try:
        # process author info
        author_list = []
        for item in r.json()['data']['attributes']['creators']:
            author_list.append(item['name'])

        info_list.append([
            r.json()['data']['attributes']['publisher'],
            doi,
            r.json()['data']['attributes']['titles'][0]['title'],
            author_list,
            r.json()['data']['attributes']['publicationYear'],
            r.json()['data']['attributes']['dates'],
            r.json()['data']['attributes']['registered'],           
        ])
    except Exception as e:
        info_list.append(["error",doi,"error","error","error","error","error"])
        
columns = ['data_publisher', 'data_doi_1', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered']
dataCite_meta_df = pd.DataFrame(info_list, columns = columns)    
print("Done!")
    

In [None]:
#concatenate citation and metadata dataframes
dataCite_df = pd.concat([dataCite_meta_df, dataCite_df_relationTypes], axis=1)

# test if data dois are the same on each row after concatenating the two dataframes
assert all(dataCite_df['data_doi_1'] == dataCite_df['data_doi'])

In [None]:
# rename columns
dataCite_df = dataCite_df.drop(['data_doi'], axis=1)
dataCite_df = dataCite_df.rename(columns={'data_doi_1':'data_doi'})
dataCite_df.columns

In [None]:
dataCite_df.columns

In [None]:

dataCite_df.columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors',
       'publicationYear', 'dates', 'registered', 'id', 'subj_id', 'source-id',
       'relation-type-id', 'occurred-at', 'Page endpoint', 'subj_doi']

In [None]:
dataCite_df.to_csv("dataCite_events_df.csv", index = False)

In [2]:
dataCite_df = pd.read_csv("dataCite_events_df.csv")

In [3]:
dataCite_df_publication_meta = getPublicationInfo.getPublicationInfo(dataCite_df)

https://doi.org/10.1016/j.scitotenv.2012.05.023
https://doi.org/10.1002/2015gl065750
https://doi.org/10.17863/cam.20713
https://doi.org/10.1111/1365-2656.12728
https://doi.org/10.5194/cp-2017-18
https://doi.org/10.1002/2016gl068130
https://doi.org/10.1029/2007gl032529
https://doi.org/10.1029/2009gl040104
https://doi.org/10.1029/2009jd012263
https://doi.org/10.1002/2015gl065750
https://doi.org/10.3189/172756494794587438
https://doi.org/10.1029/2007gl032529
https://doi.org/10.1029/2009gl040104
https://doi.org/10.1111/1365-2656.12798
https://doi.org/10.5194/cp-2017-18
https://doi.org/10.1029/2018jc013982
https://doi.org/10.1175/jcli-d-17-0320.1
https://doi.org/10.1111/ele.13129
https://doi.org/10.1016/j.jenvrad.2017.06.024
https://doi.org/10.1029/2018jc014464
https://doi.org/10.5285/4859dc19-e8e9-4148-8c50-cb2ab16dc696
https://doi.org/10.5285/65abc40d-e256-414b-8b50-a5569556d1be
https://doi.org/10.5285/c11bdb27-df44-4b56-8f4c-afc51b6e1e3a
https://doi.org/10.1016/j.cub.2017.04.034
https://

In [6]:
dataCite_df_publication_meta = pd.read_csv("dataCite_df_publication_meta.csv")

In [7]:
dataCite_df_publication_meta

Unnamed: 0,data_publisher,data_doi,data_title,data_authors,publicationYear,dates,registered,id,subj_id,source-id,relation-type-id,occurred-at,Page endpoint,subj_doi,pub_Title,pub_authors,publisher
0,NERC EDS Environmental Information Data Centre,10.5285/2641515f-5b76-445c-a936-1da51bf365ad,Exposure of burrowing mammals to Radon Rn-222 ...,"['Beresford, N.A.', 'Barnett, C.L.', 'Vives I ...",2012,"[{'date': '2012-05-29', 'dateType': 'Submitted...",2012-05-29T13:58:37.000Z,26a0d78f-5efe-4ac9-9aed-6654d2a3af61,https://doi.org/10.1016/j.scitotenv.2012.05.023,datacite-crossref,is-cited-by,2019-08-02T01:05:30.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1016/j.scitotenv.2012.05.023,Exposure of burrowing mammals to 222Rn,"[['N.A.', 'Beresford'], ['C.L.', 'Barnett'], [...",Elsevier BV
1,"Polar Data Centre; British Antarctic Survey, N...",10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935,Snow accumulation from the Bryan Coast ice cor...,"['Thomas, Elizabeth']",2017,"[{'date': '2010', 'dateType': 'Collected'}, {'...",2017-07-10T16:43:38.000Z,8875aad2-cbde-41c3-93b3-d63cb21f4d2d,https://doi.org/10.1002/2015gl065750,datacite-crossref,is-cited-by,2019-08-01T11:28:04.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1002/2015gl065750,Twentieth century increase in snowfall in coas...,"[['E. R.', 'Thomas'], ['J. S.', 'Hosking'], ['...",American Geophysical Union (AGU)
2,NERC EDS Environmental Information Data Centre,10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3,Foraging behaviour of Parus major held in temp...,"['Thorogood, R', 'Kokko, H', 'Mappes, J']",2017,"[{'date': '2017-10-20', 'dateType': 'Submitted...",2017-10-20T09:10:28.000Z,6fbc8dba-128b-47a1-a137-8acc251cf530,https://doi.org/10.17863/cam.20713,datacite-related,is-cited-by,2019-10-14T08:32:43.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.17863/cam.20713,Info not given,Info not given,Info not given
3,NERC EDS Environmental Information Data Centre,10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f,Invertebrate activity data from an experiment ...,"['Griffiths, H.M.', 'Ashton, L.A.', 'Walker, A...",2017,"[{'date': '2017-06-29', 'dateType': 'Submitted...",2017-06-29T14:31:55.000Z,587dccf9-8c7f-4593-88f9-7a82cb80cdbe,https://doi.org/10.1111/1365-2656.12728,datacite-crossref,is-cited-by,2019-10-01T19:01:02.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1111/1365-2656.12728,Ants are the major agents of resource removal ...,"[['Hannah M.', 'Griffiths'], ['Louise A.', 'As...",Wiley
4,"Polar Data Centre, Natural Environment Researc...",10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32,Antarctic regional snow accumulation composite...,"['Thomas, Elizabeth']",2017,"[{'date': '2017-07', 'dateType': 'Accepted'}, ...",2017-07-13T10:43:51.000Z,fb23ff88-1148-4505-829f-c5289a5a035a,https://doi.org/10.5194/cp-2017-18,datacite-crossref,is-cited-by,2019-08-01T11:27:16.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5194/cp-2017-18,Review of regional Antarctic snow accumulation...,"[['Elizabeth R.', 'Thomas'], ['J. Melchior', '...",Copernicus GmbH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1836,NERC EDS UK Polar Data Centre,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,fd1a53b9-3c4e-43e7-857a-61add92a61d0,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,Info not given,Info not given,Info not given
1837,NERC EDS UK Polar Data Centre,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,a63cb7b5-5caf-4ece-a436-2ffe0c80c652,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,Info not given,Info not given,Info not given
1838,NERC EDS UK Polar Data Centre,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,24afe93c-f4c0-4539-8196-8d5c499cdfdf,https://www.bas.ac.uk/project/basal-conditions...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://www.bas.ac.uk/project/basal-conditions...,Info not given,Info not given,Info not given
1839,NERC EDS UK Polar Data Centre,10.5285/beda45d1-dd33-4666-8861-b4b91af0180f,Simulated changes in East Antarctic mass balan...,"['Jordan, James', 'Stokes, Chris', 'Miles, Ber...",2023,"[{'date': '2020-01-01/2220-01-01', 'dateType':...",2023-02-24T11:53:54.000Z,f04dd43a-d799-4d31-aae9-b13c25357048,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,datacite-url,is-part-of,2023-04-03T10:12:22.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,Info not given,Info not given,Info not given


In [8]:
newPublisherLst = []
for dataCentreName in dataCite_df_publication_meta['data_publisher']:
    if type(dataCentreName) == float or dataCentreName is None:
        newPublisherLst.append(dataCentreName)
        continue
    else:
        pass

    dataCentreName_lower = dataCentreName.lower() # make it all lowercase as 'in' operator used below is case sensitive
    if 'polar' in dataCentreName_lower:
        newPublisherLst.append('Polar Data Centre (PDC)')
    elif 'atmospheric' in dataCentreName_lower or 'badc' in dataCentreName_lower or 'earth' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    elif 'oceanographic' in dataCentreName_lower:
        newPublisherLst.append('British Oceanographic Data Centre (BODC)')
    elif 'geological' in dataCentreName_lower or 'geoscience' in dataCentreName_lower:
        newPublisherLst.append('National Geoscience Data Centre (NGDC)')
    elif 'environmental information' in dataCentreName_lower:
        newPublisherLst.append('Environmental Information Data Centre (EIDC)')
    elif 'environmental data' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    else:
        newPublisherLst.append(dataCentreName)
dataCite_df_publication_meta['publisher_processed'] = newPublisherLst


In [9]:
dataCite_df_publication_meta['publisher_processed'].value_counts()

Environmental Information Data Centre (EIDC)     921
Polar Data Centre (PDC)                          430
Centre for Environmental Data Analysis (CEDA)    410
error                                             71
British Oceanographic Data Centre (BODC)           6
SEANOE                                             1
University of Leeds                                1
Dryad                                              1
Name: publisher_processed, dtype: int64

In [11]:
dataCite_df_publication_meta = dataCite_df_publication_meta.drop(['data_publisher'], axis=1)
dataCite_df_publication_meta = dataCite_df_publication_meta.rename(columns={'publisher_processed':'data_publisher'})

In [13]:
dataCite_df_publication_meta.to_csv("dataCite_df_events_publication_meta.csv", index = False)

In [12]:
dataCite_df_publication_meta

Unnamed: 0,data_doi,data_title,data_authors,publicationYear,dates,registered,id,subj_id,source-id,relation-type-id,occurred-at,Page endpoint,subj_doi,pub_Title,pub_authors,publisher,data_publisher
0,10.5285/2641515f-5b76-445c-a936-1da51bf365ad,Exposure of burrowing mammals to Radon Rn-222 ...,"['Beresford, N.A.', 'Barnett, C.L.', 'Vives I ...",2012,"[{'date': '2012-05-29', 'dateType': 'Submitted...",2012-05-29T13:58:37.000Z,26a0d78f-5efe-4ac9-9aed-6654d2a3af61,https://doi.org/10.1016/j.scitotenv.2012.05.023,datacite-crossref,is-cited-by,2019-08-02T01:05:30.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1016/j.scitotenv.2012.05.023,Exposure of burrowing mammals to 222Rn,"[['N.A.', 'Beresford'], ['C.L.', 'Barnett'], [...",Elsevier BV,Environmental Information Data Centre (EIDC)
1,10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935,Snow accumulation from the Bryan Coast ice cor...,"['Thomas, Elizabeth']",2017,"[{'date': '2010', 'dateType': 'Collected'}, {'...",2017-07-10T16:43:38.000Z,8875aad2-cbde-41c3-93b3-d63cb21f4d2d,https://doi.org/10.1002/2015gl065750,datacite-crossref,is-cited-by,2019-08-01T11:28:04.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1002/2015gl065750,Twentieth century increase in snowfall in coas...,"[['E. R.', 'Thomas'], ['J. S.', 'Hosking'], ['...",American Geophysical Union (AGU),Polar Data Centre (PDC)
2,10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3,Foraging behaviour of Parus major held in temp...,"['Thorogood, R', 'Kokko, H', 'Mappes, J']",2017,"[{'date': '2017-10-20', 'dateType': 'Submitted...",2017-10-20T09:10:28.000Z,6fbc8dba-128b-47a1-a137-8acc251cf530,https://doi.org/10.17863/cam.20713,datacite-related,is-cited-by,2019-10-14T08:32:43.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.17863/cam.20713,Info not given,Info not given,Info not given,Environmental Information Data Centre (EIDC)
3,10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f,Invertebrate activity data from an experiment ...,"['Griffiths, H.M.', 'Ashton, L.A.', 'Walker, A...",2017,"[{'date': '2017-06-29', 'dateType': 'Submitted...",2017-06-29T14:31:55.000Z,587dccf9-8c7f-4593-88f9-7a82cb80cdbe,https://doi.org/10.1111/1365-2656.12728,datacite-crossref,is-cited-by,2019-10-01T19:01:02.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.1111/1365-2656.12728,Ants are the major agents of resource removal ...,"[['Hannah M.', 'Griffiths'], ['Louise A.', 'As...",Wiley,Environmental Information Data Centre (EIDC)
4,10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32,Antarctic regional snow accumulation composite...,"['Thomas, Elizabeth']",2017,"[{'date': '2017-07', 'dateType': 'Accepted'}, ...",2017-07-13T10:43:51.000Z,fb23ff88-1148-4505-829f-c5289a5a035a,https://doi.org/10.5194/cp-2017-18,datacite-crossref,is-cited-by,2019-08-01T11:27:16.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5194/cp-2017-18,Review of regional Antarctic snow accumulation...,"[['Elizabeth R.', 'Thomas'], ['J. Melchior', '...",Copernicus GmbH,Polar Data Centre (PDC)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1836,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,fd1a53b9-3c4e-43e7-857a-61add92a61d0,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,Info not given,Info not given,Info not given,Polar Data Centre (PDC)
1837,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,a63cb7b5-5caf-4ece-a436-2ffe0c80c652,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,Info not given,Info not given,Info not given,Polar Data Centre (PDC)
1838,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,24afe93c-f4c0-4539-8196-8d5c499cdfdf,https://www.bas.ac.uk/project/basal-conditions...,datacite-url,is-part-of,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://www.bas.ac.uk/project/basal-conditions...,Info not given,Info not given,Info not given,Polar Data Centre (PDC)
1839,10.5285/beda45d1-dd33-4666-8861-b4b91af0180f,Simulated changes in East Antarctic mass balan...,"['Jordan, James', 'Stokes, Chris', 'Miles, Ber...",2023,"[{'date': '2020-01-01/2220-01-01', 'dateType':...",2023-02-24T11:53:54.000Z,f04dd43a-d799-4d31-aae9-b13c25357048,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,datacite-url,is-part-of,2023-04-03T10:12:22.000Z,https://api.datacite.org/events?page%5Bcursor%...,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,Info not given,Info not given,Info not given,Polar Data Centre (PDC)


In [None]:
# chat gpt reccommendation
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry
import numpy as np
import pandas as pd
import time
    
def getDataCiteCitations_relationTypes(relation_type_id_list):

    column_names = ["id", "subj-id", "obj-id", "source-id", "relation-type-id", "occurred-at", "Page endpoint"]
    dataCite_info_relationTypes = []  # create an empty list in which all the DataCite info will be placed
    
    # set up retry mechanism for requests
    retry_strategy = Retry(
        total=5,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)

    for relation_type_id in relation_type_id_list:

        dataCite_info = []

        # send a request to get initial info from DataCite
        headers = {
        'prefix': '10.5285',
        'page[cursor]': '1',
        'page[size]': '1000',
        'relation-type-id': relation_type_id
        }

        # retry request a few times before giving up
        for retry_count in range(5):
            try:
                r = http.get('https://api.datacite.org/events', headers=headers)
                r.raise_for_status()
                break
            except RequestException as e:
                print(f"Error occurred while making request: {e}")
                if retry_count < 4:
                    print(f"Retrying request in {2 ** retry_count} seconds...")
                    time.sleep(2 ** retry_count)
                else:
                    raise e

        print(relation_type_id)

        # determine the total number of pages and dataset records
        totalPages = r.json()['meta']['total-pages']
        totalRecords = r.json()['meta']['total']
        print("Total records:", totalRecords)
        print("Total pages:", totalPages)

        # create array from 1 to total number of pages to loop through
        pages = np.arange(1,totalPages+1)
        # set next page url
        if totalPages > 1:
            next_url = r.json()['links']['next']
        else:
            pass
            

        #loop through pages
        for p in pages:
            if p == 1:
                url = 'https://api.datacite.org/events?page[cursor]=1'
            else:
                url = next_url

            # make the API request and print the status code in case of an error
            headers = {'prefix': '10.5285',
                       'page[size]': '1000',
                      '
