https://support.datacite.org/docs/api-get-lists

In [1]:
import requests
import pandas as pd
import numpy as np
import time
from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy

In [None]:
datacite_url = 'https://api.datacite.org/events'
# https://api.datacite.org/events?page[cursor]=1&page[size]=1000
headers = {
    'prefix': '10.5285',
    'page[cursor]': '1',
    'page[size]': '10'
}
r = requests.get(datacite_url, headers)
print(r.status_code)
r.json().keys()
r.json()['data']

In [None]:
# get more info from datacite API 
# hopefully end up with these columns:
# ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors']

pubDOI = '10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359'
r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
print(r.status_code)
print(r.json()['data']['attributes'].keys())
r.json()['data']['attributes']


## Workflow

In [4]:
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to']

In [5]:
dataCite_df_relationTypes = getDataCiteCitations_relationTypes.getDataCiteCitations_relationTypes(relation_type_id_list)

is-cited-by
Total records: 19
Total pages: 1
Status:  200
Page:  1
Final page
is-referenced-by
Total records: 1188
Total pages: 2
Status:  200
Page:  1
https://api.datacite.org/events?page%5Bcursor%5D=MTY3NTk0MDk2OTEyOCw1NGQ3ZDZhZi04OTdjLTQzNzMtOGJmMi1iMzczY2VjNjljNzc&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  2
Final page
is-supplement-to
Total records: 8
Total pages: 1
Status:  200
Page:  1
Final page
is-part-of
Total records: 626
Total pages: 1
Status:  200
Page:  1
Final page
Done!


In [6]:
print(dataCite_df_relationTypes.columns)
dataCite_df_relationTypes.head(n=5)

Index(['id', 'subj-id', 'obj-id', 'source-id', 'relation-type-id',
       'occurred-at', 'Page endpoint'],
      dtype='object')


Unnamed: 0,id,subj-id,obj-id,source-id,relation-type-id,occurred-at,Page endpoint
0,26a0d78f-5efe-4ac9-9aed-6654d2a3af61,https://doi.org/10.5285/2641515f-5b76-445c-a93...,https://doi.org/10.1016/j.scitotenv.2012.05.023,datacite-crossref,is-cited-by,2019-08-02T01:05:30.000Z,https://api.datacite.org/events?page%5Bcursor%...
1,8875aad2-cbde-41c3-93b3-d63cb21f4d2d,https://doi.org/10.5285/6feac38a-5847-46f9-84e...,https://doi.org/10.1002/2015gl065750,datacite-crossref,is-cited-by,2019-08-01T11:28:04.000Z,https://api.datacite.org/events?page%5Bcursor%...
2,6fbc8dba-128b-47a1-a137-8acc251cf530,https://doi.org/10.5285/db55406b-c9a1-4a9e-88c...,https://doi.org/10.17863/cam.20713,datacite-related,is-cited-by,2019-10-14T08:32:43.000Z,https://api.datacite.org/events?page%5Bcursor%...
3,587dccf9-8c7f-4593-88f9-7a82cb80cdbe,https://doi.org/10.5285/5321bc6e-be35-4ed3-9b5...,https://doi.org/10.1111/1365-2656.12728,datacite-crossref,is-cited-by,2019-10-01T19:01:02.000Z,https://api.datacite.org/events?page%5Bcursor%...
4,fb23ff88-1148-4505-829f-c5289a5a035a,https://doi.org/10.5285/c4ecfe25-12f2-453b-ad1...,https://doi.org/10.5194/cp-2017-18,datacite-crossref,is-cited-by,2019-08-01T11:27:16.000Z,https://api.datacite.org/events?page%5Bcursor%...


In [7]:
# remove http from DOI url
doi_list = []
for url in dataCite_df_relationTypes['subj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['data_doi'] = doi_list

dataCite_df_relationTypes = dataCite_df_relationTypes.drop(['subj-id'], axis=1)

doi_list = []
for url in dataCite_df_relationTypes['obj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['pub_doi'] = doi_list # rename to work in getPublicationInfo function 
 # they seem to be the wrong way round from datacite - double check this
    
dataCite_df_relationTypes = dataCite_df_relationTypes.rename(columns={"obj-id": "pub_doi_url"})

In [10]:
# drop the rows where the data_doi column value does not start with "10.5285"
dataCite_df_relationTypes = dataCite_df_relationTypes[dataCite_df_relationTypes['data_doi'].str.startswith('10.5285')]

In [15]:
# get dataset metadata 
info_list = []
headers = {'client-id': 'bl.nerc'}
api_url = 'https://api.datacite.org/dois/' 
# for doi in dataCite_df_relationTypes['data_doi']:
for (source_id, relation_type_id, occurred_at, Page_endpoint, data_doi, pub_doi, pub_doi_url) in zip(dataCite_df_relationTypes['source-id'],dataCite_df_relationTypes['relation-type-id'],dataCite_df_relationTypes['occurred-at'],dataCite_df_relationTypes['Page endpoint'],dataCite_df_relationTypes['data_doi'],dataCite_df_relationTypes['pub_doi'], dataCite_df_relationTypes['pub_doi_url']):
    r = requests.get((api_url + data_doi), headers)
    print(r.status_code, data_doi)
    
    try:
        # process author info
        author_list = []
        for item in r.json()['data']['attributes']['creators']:
            author_list.append(item['name'])

        info_list.append([
            r.json()['data']['attributes']['publisher'],
            data_doi,
            r.json()['data']['attributes']['titles'][0]['title'],
            author_list,
            r.json()['data']['attributes']['publicationYear'],
            r.json()['data']['attributes']['dates'],
            r.json()['data']['attributes']['registered'],
            source_id, relation_type_id, pub_doi, pub_doi_url, occurred_at, Page_endpoint
        ])
    except Exception as e:
        info_list.append(["error",data_doi,"error","error","error","error","error","error","error",pub_doi,"error","error","error"])
        
columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered', 
           'source-id', 'relation-type-id', 'pub_doi', 'pub_doi_url', 'occurred-at', 'Page endpoint']
dataCite_df = pd.DataFrame(info_list, columns = columns)    
print("Done!")
    

200 10.5285/2641515f-5b76-445c-a936-1da51bf365ad
200 10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935
200 10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3
200 10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f
200 10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32
200 10.5285/1f44795b-e596-433c-b69f-caf674880daa
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/9ab1541b-e71a-4d75-9907-968d47755e99
200 10.5285/21a8fca5-8eae-48e4-93c0-bc6b4433e34c
200 10.5285/d77dd930-654b-4d09-99bd-8df3b00025a8
200 10.5285/d77dd930-654b-4d09-99bd-8df3b00025a8
200 10.5285/0f074839-1630-4ccd-aa63-84d0da16b28a
200 10.5285/cc1d42de-dfe6-40aa-a1a6-d45cb2fc8293
200 10.5285/dada63fb-c40a-4b13-97ba-c53860881d79
200 10.5285/3952a4fe-683a-42e7-a074-bdec41c8ab16
200 10.5285/3578bae2-4b88-4b2e-93e1-6965dfe1348c
200 10.5285/a1ab8c79-3426-43a4-ab42-6d1b218d1cc6
200 10.5285/0cf552a6-cd62-4da0-8289-8e4bab0a35a8
200 10.5285/65abc40d

Save the datacite dataframe before getting publication info (as this takes a long time)

In [21]:
dataCite_df.to_csv("dataCite_events_df.csv", index = False)

In [2]:
dataCite_df = pd.read_csv("dataCite_events_df.csv")

In [3]:
dataCite_df_temp = dataCite_df.rename(columns={"pub_doi_url": "subj_id"})
dataCite_df_publication_meta = getPublicationInfo_timeCopy.getPublicationInfo(dataCite_df_temp)

https://doi.org/10.1016/j.scitotenv.2012.05.023 200
https://doi.org/10.1002/2015gl065750 200
https://doi.org/10.17863/cam.20713 200
https://doi.org/10.1111/1365-2656.12728 200
https://doi.org/10.5194/cp-2017-18 200
https://doi.org/10.1002/2016gl068130 200
https://doi.org/10.1029/2007gl032529 200
https://doi.org/10.1029/2009gl040104 200
https://doi.org/10.1029/2009jd012263 200
https://doi.org/10.1002/2015gl065750 200
https://doi.org/10.3189/172756494794587438 200
https://doi.org/10.1029/2007gl032529 200
https://doi.org/10.1029/2009gl040104 200
https://doi.org/10.1111/1365-2656.12798 200
https://doi.org/10.5194/cp-2017-18 200
https://doi.org/10.1029/2018jc013982 200
https://doi.org/10.1175/jcli-d-17-0320.1 200
https://doi.org/10.1111/ele.13129 200
https://doi.org/10.1016/j.jenvrad.2017.06.024 200
https://doi.org/10.1029/2018jc014464 200
https://doi.org/10.5285/4859dc19-e8e9-4148-8c50-cb2ab16dc696 200
https://doi.org/10.5285/65abc40d-e256-414b-8b50-a5569556d1be 200
https://doi.org/10.5285

In [None]:
dataCite_df_publication_meta

In [5]:
#process dataset publisher names
newPublisherLst = []
for dataCentreName in dataCite_df_publication_meta['data_publisher']:
    if type(dataCentreName) == float or dataCentreName is None:
        newPublisherLst.append(dataCentreName)
        continue
    else:
        pass

    dataCentreName_lower = dataCentreName.lower() # make it all lowercase as 'in' operator used below is case sensitive
    if 'polar' in dataCentreName_lower:
        newPublisherLst.append('Polar Data Centre (PDC)')
    elif 'atmospheric' in dataCentreName_lower or 'badc' in dataCentreName_lower or 'earth' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    elif 'oceanographic' in dataCentreName_lower:
        newPublisherLst.append('British Oceanographic Data Centre (BODC)')
    elif 'geological' in dataCentreName_lower or 'geoscience' in dataCentreName_lower:
        newPublisherLst.append('National Geoscience Data Centre (NGDC)')
    elif 'environmental information' in dataCentreName_lower:
        newPublisherLst.append('Environmental Information Data Centre (EIDC)')
    elif 'environmental data' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    else:
        newPublisherLst.append(dataCentreName)
dataCite_df_publication_meta['publisher_processed'] = newPublisherLst

dataCite_df_publication_meta = dataCite_df_publication_meta.drop(['data_publisher'], axis=1)
dataCite_df_publication_meta = dataCite_df_publication_meta.rename(columns={'publisher_processed':'data_publisher'})

In [6]:
dataCite_df_publication_meta.to_csv("dataCite_df_events_publication_meta.csv", index = False)

In [7]:
dataCite_df_publication_meta

Unnamed: 0,data_doi,data_title,data_authors,publicationYear,dates,registered,source-id,relation-type-id,pub_doi,subj_id,occurred-at,Page endpoint,pub_Title,pub_authors,publisher,data_publisher
0,10.5285/2641515f-5b76-445c-a936-1da51bf365ad,Exposure of burrowing mammals to Radon Rn-222 ...,"['Beresford, N.A.', 'Barnett, C.L.', 'Vives I ...",2012,"[{'date': '2012-05-29', 'dateType': 'Submitted...",2012-05-29T13:58:37.000Z,datacite-crossref,is-cited-by,10.1016/j.scitotenv.2012.05.023,https://doi.org/10.1016/j.scitotenv.2012.05.023,2019-08-02T01:05:30.000Z,https://api.datacite.org/events?page%5Bcursor%...,Exposure of burrowing mammals to 222Rn,"[[N.A., Beresford], [C.L., Barnett], [J., Vive...",Elsevier BV,Environmental Information Data Centre (EIDC)
1,10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935,Snow accumulation from the Bryan Coast ice cor...,"['Thomas, Elizabeth']",2017,"[{'date': '2010', 'dateType': 'Collected'}, {'...",2017-07-10T16:43:38.000Z,datacite-crossref,is-cited-by,10.1002/2015gl065750,https://doi.org/10.1002/2015gl065750,2019-08-01T11:28:04.000Z,https://api.datacite.org/events?page%5Bcursor%...,Twentieth century increase in snowfall in coas...,"[[E. R., Thomas], [J. S., Hosking], [R. R., Tu...",American Geophysical Union (AGU),Polar Data Centre (PDC)
2,10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3,Foraging behaviour of Parus major held in temp...,"['Thorogood, R', 'Kokko, H', 'Mappes, J']",2017,"[{'date': '2017-10-20', 'dateType': 'Submitted...",2017-10-20T09:10:28.000Z,datacite-related,is-cited-by,10.17863/cam.20713,https://doi.org/10.17863/cam.20713,2019-10-14T08:32:43.000Z,https://api.datacite.org/events?page%5Bcursor%...,Info not given,Info not given,Info not given,Environmental Information Data Centre (EIDC)
3,10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f,Invertebrate activity data from an experiment ...,"['Griffiths, H.M.', 'Ashton, L.A.', 'Walker, A...",2017,"[{'date': '2017-06-29', 'dateType': 'Submitted...",2017-06-29T14:31:55.000Z,datacite-crossref,is-cited-by,10.1111/1365-2656.12728,https://doi.org/10.1111/1365-2656.12728,2019-10-01T19:01:02.000Z,https://api.datacite.org/events?page%5Bcursor%...,Ants are the major agents of resource removal ...,"[[Hannah M., Griffiths], [Louise A., Ashton], ...",Wiley,Environmental Information Data Centre (EIDC)
4,10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32,Antarctic regional snow accumulation composite...,"['Thomas, Elizabeth']",2017,"[{'date': '2017-07', 'dateType': 'Accepted'}, ...",2017-07-13T10:43:51.000Z,datacite-crossref,is-cited-by,10.5194/cp-2017-18,https://doi.org/10.5194/cp-2017-18,2019-08-01T11:27:16.000Z,https://api.datacite.org/events?page%5Bcursor%...,Review of regional Antarctic snow accumulation...,"[[Elizabeth R., Thomas], [J. Melchior, van Wes...",Copernicus GmbH,Polar Data Centre (PDC)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,datacite-url,is-part-of,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,https://gtr.ukri.org/projects?ref=ne%2Fg013187...,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,not a doi,not a doi,not a doi,Polar Data Centre (PDC)
1763,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,datacite-url,is-part-of,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,https://gtr.ukri.org/projects?ref=ne%2Fg014159...,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,not a doi,not a doi,not a doi,Polar Data Centre (PDC)
1764,10.5285/6fcc17ad-425b-4367-bd23-c4133a38e359,"Shear wave splitting catalogue, Rutford Ice St...","['Kufner, Sofia-Katerina', 'Brisbourne, Alex',...",2022,"[{'date': '2018-11-20/2019-02-16', 'dateType':...",2022-06-24T11:04:24.000Z,datacite-url,is-part-of,https://www.bas.ac.uk/project/basal-conditions...,https://www.bas.ac.uk/project/basal-conditions...,2023-03-14T11:30:50.000Z,https://api.datacite.org/events?page%5Bcursor%...,not a doi,not a doi,not a doi,Polar Data Centre (PDC)
1765,10.5285/beda45d1-dd33-4666-8861-b4b91af0180f,Simulated changes in East Antarctic mass balan...,"['Jordan, James', 'Stokes, Chris', 'Miles, Ber...",2023,"[{'date': '2020-01-01/2220-01-01', 'dateType':...",2023-02-24T11:53:54.000Z,datacite-url,is-part-of,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,https://gtr.ukri.org/projects?ref=ne%2Fr000719...,2023-04-03T10:12:22.000Z,https://api.datacite.org/events?page%5Bcursor%...,not a doi,not a doi,not a doi,Polar Data Centre (PDC)
