### NERC dataset citations - DataCite
Code to collect NERC dataset DOIs and citation information from the DataCite API

In [1]:

import requests, time, json, re, datetime, os, sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import numpy as np
import pandas as pd
from math import ceil
from datetime import date
# from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy
# import exportCitationResultsToCsv
# from Results import convertCSVtoJSON


from citations_fun.getDataCiteCitations_relationTypes import getDataCiteCitations_relationTypes


## DataCite events API
Get citation event data from the DataCite Events API

In [2]:
# Relation types handed to getDataCiteCitations_relationTypes.
# NOTE(review): in the recorded run only the three kebab-case IDs returned any
# records; every CamelCase ID reported "Total records: 0". Confirm which
# spelling the events API expects for these relation types.
relation_type_id_list = [
    'is-cited-by',
    'is-referenced-by',
    'is-supplement-to',
    'IsPartOf',
    'IsContinuedBy',
    'IsDescribedBy',
    'IsDocumentedBy',
    'IsDerivedFrom',
    'IsRequiredBy',
]
dataCite_df_relationTypes = getDataCiteCitations_relationTypes(
    relation_type_id_list
)

is-cited-by
Total records: 211
Total pages: 1
Status:  200
Page:  1
Final page
is-referenced-by
Total records: 2498
Total pages: 3
Status:  200
Page:  1
https://api.datacite.org/events?page%5Bcursor%5D=MTY3NTk0MDk2OTEyOCw4NTU1Nzg1NC00ZTBjLTRjZmMtODlkYS05ZDVmMTg0NjAzYzI&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  2
https://api.datacite.org/events?page%5Bcursor%5D=MTc0MTc5MzY1NzY3MSw2YTBmMWVhNi01NjFmLTQwN2YtYjllZC1iMWQxNzU0NmZlMTM&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  3
Final page
is-supplement-to
Total records: 12
Total pages: 1
Status:  200
Page:  1
Final page
IsPartOf
Total records: 0
Total pages: 0
IsContinuedBy
Total records: 0
Total pages: 0
IsDescribedBy
Total records: 0
Total pages: 0
IsDocumentedBy
Total records: 0
Total pages: 0
IsDerivedFrom
Total records: 0
Total pages: 0
IsRequiredBy
Total records: 0
Total pages: 0
Done!


In [3]:
# Attach the dataset DOI metadata to every citation event row.

# Reading the file is only necessary when this notebook is not run in the same
# environment as getNERCDataDOIs().
with open("results/nerc_datacite_dois.json") as doi_file:
    nerc_datacite_dois = json.load(doi_file)

nerc_datacite_dois_df = pd.DataFrame(nerc_datacite_dois)

# Left join on the shared 'data_doi' column: every event row is kept, and the
# events table is expected to be a subset of the DOI metadata table.
datacite_doi_events_df = pd.merge(
    dataCite_df_relationTypes,
    nerc_datacite_dois_df,
    how='left',
    on='data_doi',
)

# Optional conversion of the merged table to a list of records (disabled):
# merged_json = datacite_doi_events_df.to_dict(orient='records')




In [4]:
# Preview the first rows of the merged events/metadata table.
datacite_doi_events_df.head()


Unnamed: 0,id,pub_doi,source-id,relation-type-id,occurred-at,Page endpoint,data_doi,data_publisher,data_title,data_dates,data_publication_year,data_authors,data_registered,data_page_number,data_self_link
0,26a0d78f-5efe-4ac9-9aed-6654d2a3af61,https://doi.org/10.1016/j.scitotenv.2012.05.023,datacite-crossref,is-cited-by,2019-08-02T01:05:30.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5285/2641515f-5b76-445c-a936-1da51bf365ad,Environmental Information Data Centre (EIDC),Exposure of burrowing mammals to Radon Rn-222 ...,"[{'date': '2012-05-29', 'dateType': 'Submitted...",2012.0,"[Beresford, N.A., Barnett, C.L., Vives i Batll...",2012-05-29T13:58:37Z,39.0,https://api.datacite.org/dois?client-id=bl.ner...
1,8875aad2-cbde-41c3-93b3-d63cb21f4d2d,https://doi.org/10.1002/2015gl065750,datacite-crossref,is-cited-by,2019-08-01T11:28:04.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935,Polar Data Centre (PDC),Snow accumulation from the Bryan Coast ice cor...,"[{'date': '2010', 'dateType': 'Collected'}, {'...",2017.0,"[Thomas, Elizabeth]",2017-07-10T16:43:38Z,184.0,https://api.datacite.org/dois?client-id=bl.ner...
2,6fbc8dba-128b-47a1-a137-8acc251cf530,https://doi.org/10.17863/cam.20713,datacite-related,is-cited-by,2019-10-14T08:32:43.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3,Environmental Information Data Centre (EIDC),Foraging behaviour of Parus major held in temp...,"[{'date': '2017-10-20', 'dateType': 'Submitted...",2017.0,"[Thorogood, R, Kokko, H, Mappes, J]",2017-10-20T09:10:28Z,56.0,https://api.datacite.org/dois?client-id=bl.ner...
3,587dccf9-8c7f-4593-88f9-7a82cb80cdbe,https://doi.org/10.1111/1365-2656.12728,datacite-crossref,is-cited-by,2019-10-01T19:01:02.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f,Environmental Information Data Centre (EIDC),Invertebrate activity data from an experiment ...,"[{'date': '2017-06-29', 'dateType': 'Submitted...",2017.0,"[Griffiths, H.M., Ashton, L.A., Walker, A.E., ...",2017-06-29T14:31:55Z,67.0,https://api.datacite.org/dois?client-id=bl.ner...
4,fb23ff88-1148-4505-829f-c5289a5a035a,https://doi.org/10.5194/cp-2017-18,datacite-crossref,is-cited-by,2019-08-01T11:27:16.000Z,https://api.datacite.org/events?page%5Bcursor%...,10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32,Polar Data Centre (PDC),Antarctic regional snow accumulation composite...,"[{'date': '2017-07', 'dateType': 'Accepted'}, ...",2017.0,"[Thomas, Elizabeth]",2017-07-13T10:43:51Z,178.0,https://api.datacite.org/dois?client-id=bl.ner...


In [6]:
# List the column labels of the merged events/metadata table.
datacite_doi_events_df.columns

Index(['id', 'pub_doi_url', 'source-id', 'relation-type-id', 'occurred-at',
       'Page endpoint', 'data_doi', 'data_publisher', 'data_title',
       'data_dates', 'data_publication_year', 'data_authors',
       'data_registered', 'data_page_number', 'data_self_link'],
      dtype='object')

In [None]:
### process publisher names - this is now done in the initial DataCite DOI collection
# from citations_fun.helper_fun import process_dataset_publisher_names
# datacite_doi_events_df = process_dataset_publisher_names(datacite_doi_events_df)

In [5]:
# Write the merged events table to the fixed "latest" results file, without
# the DataFrame index.
# Date-stamped alternative, kept for reference (disabled):
# today = date.today()
# dataCite_filename = "results/" + 'datacite_doi_events_df_retrieved_' + (today.strftime("%d%m%Y")) + '.csv'
datacite_doi_events_df.to_csv(
    path_or_buf="results/latest_results_dataCite.csv",
    index=False,
)

In [None]:
# don't need anymore as not using json?
# Pairs of (existing key, replacement key). Several entries map a key to
# itself, i.e. they leave that key unchanged.
key_to_column_mapping = dict([
    ("publisher", "data_publisher"),
    ("data_doi", "data_doi"),
    ("title", "data_title"),
    ("authors", "data_authors"),
    ("publicationYear", "publicationYear"),
    ("dates", "dates"),
    ("source-id", "source-id"),
    ("relation-type-id", "relation-type-id"),
    ("pub_doi_url", "subj-id"),
    ("occurred-at", "occurred-at"),
    ("Page endpoint", "Page endpoint"),
])

# need to delete datasetDOI_attribute at some point?
# renamed_json.pop('datasetDOI_attribute', None)


# # Rename the keys in merged_json according to the mapping
# renamed_json = {key_to_column_mapping.get(k, k): v for k, v in merged_json[0].items()}
# renamed_json

def rename_keys(dict, mapping):
    """Return a new dict with the keys of ``dict`` translated via ``mapping``.

    Keys that have no entry in ``mapping`` are kept as they are; values are
    copied across unmodified. The input is not mutated.

    Note: the first parameter shadows the built-in ``dict`` inside this
    function; the name is kept so existing callers are unaffected.
    """
    renamed = {}
    for old_key, value in dict.items():
        new_key = mapping.get(old_key, old_key)
        renamed[new_key] = value
    return renamed

# Build the list of row records here so this cell does not depend on a
# `merged_json` left over from an earlier kernel session (its only other
# assignment in the notebook is commented out, which made this cell raise
# NameError on a fresh run).
merged_json = datacite_doi_events_df.to_dict(orient='records')

# Translate the keys of every record through key_to_column_mapping; keys
# without a mapping entry are kept unchanged.
renamed_json = [rename_keys(item, key_to_column_mapping) for item in merged_json]
datacite_json = renamed_json

    


In [None]:
# # get dataset metadata - old - get rid?
# info_list = []
# headers = {'client-id': 'bl.nerc'}
# api_url = 'https://api.datacite.org/dois/' 
# # for doi in dataCite_df_relationTypes['data_doi']:
# for (source_id, relation_type_id, occurred_at, Page_endpoint, data_doi, pub_doi_url) in zip(dataCite_df_relationTypes['source-id'],dataCite_df_relationTypes['relation-type-id'],dataCite_df_relationTypes['occurred-at'],dataCite_df_relationTypes['Page endpoint'],dataCite_df_relationTypes['data_doi'], dataCite_df_relationTypes['pub_doi_url']):
#     r = requests.get((api_url + data_doi), headers)
#     print(r.status_code, data_doi)
    
#     try:
#         # process author info
#         author_list = []
#         for item in r.json()['data']['attributes']['creators']:
#             author_list.append(item['name'])

#         info_list.append([
#             r.json()['data']['attributes']['publisher'],
#             data_doi,
#             r.json()['data']['attributes']['titles'][0]['title'],
#             author_list,
#             r.json()['data']['attributes']['publicationYear'],
#             r.json()['data']['attributes']['dates'],
#             r.json()['data']['attributes']['registered'],
#             source_id, relation_type_id, pub_doi_url, occurred_at, Page_endpoint
#         ])
#     except Exception as e:
#         info_list.append(["error",data_doi,"error","error","error","error","error","error","error",pub_doi_url,"error","error"])
        
# columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered', 
#            'source-id', 'relation-type-id', 'pub_doi_url', 'occurred-at', 'Page endpoint']
# dataCite_df = pd.DataFrame(info_list, columns = columns)    
# print("Done!")
    

In [None]:
import importlib

# Development helper: pick up edits made to the helper module on disk without
# restarting the kernel.
# The notebook's top-level import of this module is commented out, so the name
# was unbound on a fresh kernel and reload() raised NameError. Bind the module
# object first, then reload it.
# NOTE(review): the package path is taken from the commented-out import and the
# recorded output of this cell; confirm the package is still named dataCite_fun.
getPublicationInfo_timeCopy = importlib.import_module(
    "dataCite_fun.getPublicationInfo_timeCopy"
)
importlib.reload(getPublicationInfo_timeCopy)

<module 'dataCite_fun.getPublicationInfo_timeCopy' from 'c:\\Users\\matnic\\OneDrive - UKCEH\\Projects\\DataCentre Citations\\dataCite_fun\\getPublicationInfo_timeCopy.py'>