### NERC dataset citations - DataCite
Code to collect NERC dataset DOIs and citation information from the DataCite API

In [1]:

import requests, time, json, re, datetime, os, sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import numpy as np
import pandas as pd
from math import ceil
from datetime import date
# from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy
# import exportCitationResultsToCsv
# from Results import convertCSVtoJSON


from citations_fun.getDataCiteCitations_relationTypes import getDataCiteCitations_relationTypes
from citations_fun.getPublicationInfo_forDataCite import getPublicationInfo


## DataCite events API
Get citation event data from datacite

In [2]:
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to', 'IsPartOf', 'IsContinuedBy', 'IsDescribedBy', 'IsDocumentedBy', 'IsDerivedFrom', 'IsRequiredBy']
dataCite_df_relationTypes = getDataCiteCitations_relationTypes(relation_type_id_list)

is-cited-by
Total records: 211
Total pages: 1
Status:  200
Page:  1
Final page
is-referenced-by
Total records: 2509
Total pages: 3
Status:  200
Page:  1
https://api.datacite.org/events?page%5Bcursor%5D=MTY3NTk0MDk2OTEyOCw4NTU1Nzg1NC00ZTBjLTRjZmMtODlkYS05ZDVmMTg0NjAzYzI&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  2
https://api.datacite.org/events?page%5Bcursor%5D=MTc0MTc5MzY1NzY3MSw2YTBmMWVhNi01NjFmLTQwN2YtYjllZC1iMWQxNzU0NmZlMTM&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  3
Final page
is-supplement-to
Total records: 15
Total pages: 1
Status:  200
Page:  1
Final page
IsPartOf
Total records: 0
Total pages: 0
IsContinuedBy
Total records: 0
Total pages: 0
IsDescribedBy
Total records: 0
Total pages: 0
IsDocumentedBy
Total records: 0
Total pages: 0
IsDerivedFrom
Total records: 0
Total pages: 0
IsRequiredBy
Total records: 0
Total pages: 0
Done!


In [3]:
# join events data with dataset DOI metadata

### don't need to load file if done in same env as getNERCDataDOIs() ###
with open("results/nerc_datacite_dois.json") as f:
    nerc_datacite_dois = json.load(f)

nerc_datacite_dois_df = pd.DataFrame(nerc_datacite_dois)

datacite_doi_events_df = dataCite_df_relationTypes.merge(
    nerc_datacite_dois_df,
    left_on='data_doi',
    right_on='data_doi',
    how='left'  # left-join as dataCite_df_relationTypes is a subset of nerc_datacite_dois_df
)


datacite_doi_events_df_drop = datacite_doi_events_df.drop(['data_page_number', 'data_self_link'], axis = 1)

# re-order
datacite_doi_events_df_drop = datacite_doi_events_df_drop[[
    'data_doi', 'data_publisher', 'data_title', 'data_publication_year', 'data_authors',
    'relation_type', 'pub_doi', 'source_id'
]]


# output results file 
# today = date.today()
# dataCite_filename = "results/" + 'datacite_doi_events_df_retrieved_' + (today.strftime("%d%m%Y")) + '.csv'
datacite_doi_events_df_drop.to_csv("results/latest_results_dataCite.csv", index = False)

In [None]:
# takes 15 mins
# get pub title, authors and date
dataCite_df_pubInfo = getPublicationInfo(datacite_doi_events_df_drop)

0 https://doi.org/10.1016/j.scitotenv.2012.05.023 200
1 https://doi.org/10.1002/2015gl065750 200
2 https://doi.org/10.17863/cam.20713 200
3 https://doi.org/10.1111/1365-2656.12728 200
4 https://doi.org/10.5194/cp-2017-18 200
5 https://doi.org/10.1002/2016gl068130 200
6 https://doi.org/10.1029/2007gl032529 200
7 https://doi.org/10.1029/2009gl040104 200
8 https://doi.org/10.1029/2009jd012263 200
9 https://doi.org/10.1002/2015gl065750 200
10 https://doi.org/10.3189/172756494794587438 200
11 https://doi.org/10.1029/2007gl032529 200
12 https://doi.org/10.1029/2009gl040104 200
13 https://doi.org/10.1111/1365-2656.12798 200
14 https://doi.org/10.5194/cp-2017-18 200
15 https://doi.org/10.1029/2018jc013982 200
16 https://doi.org/10.1175/jcli-d-17-0320.1 200
17 https://doi.org/10.1111/ele.13129 200
18 https://doi.org/10.1016/j.jenvrad.2017.06.024 200
19 https://doi.org/10.1029/2018jc014464 200
20 https://doi.org/10.5285/4859dc19-e8e9-4148-8c50-cb2ab16dc696 200
21 https://doi.org/10.5285/65abc40d

In [None]:
dataCite_df_pubInfo.to_csv("results/latest_results_dataCite_publicationInfo.csv", index= False )


In [12]:
dataCite_df_pubInfo = merged_df

In [16]:
with pd.option_context('display.max_colwidth', None):
    print(dataCite_df_pubInfo.iloc[50])

data_doi                                                                                                               10.5285/fdf8c8d3-5998-45a5-8431-7f5e6302fc32
data_publisher                                                                                                         Environmental Information Data Centre (EIDC)
data_title                                                                                               Land Cover Map 2007 (1km percentage target class, GB) v1.2
data_publication_year                                                                                                                                        2014.0
data_authors                                                                        [Morton, R.D., Rowland, C.S., Wood, C.M., Meek, L., Marston, C.G., Smith, G.M.]
relation_type                                                                                                                                      is-referenced-by
pub_doi         

In [27]:
date = dataCite_df_pubInfo['pub_date'].iloc[50]

yr = date[0][0]
month = date[0][1]
day = date[0][2]

pub_date  = f"{date[0][2]}/{date[0][1]}/{date[0][0]}"
print(pub_date)


21/3/2016


Don't need stuff below here:

In [None]:
datacite_doi_events_df_drop.head()

In [None]:
# don't need anymore as not using json?
key_to_column_mapping = {
    "publisher": "data_publisher",
    "data_doi": "data_doi",
    "title": "data_title",
    "authors": "data_authors",
    "publicationYear": "publicationYear",
    "dates": "dates",
    "source-id": "source-id",
    "relation-type-id": "relation-type-id",
    "pub_doi_url": "subj-id",
    "occurred-at": "occurred-at",
    "Page endpoint": "Page endpoint"
}

# need to delete datasetDOI_attribute at some point?
# renamed_json.pop('datasetDOI_attribute', None)


# # Rename the keys in merged_json according to the mapping
# renamed_json = {key_to_column_mapping.get(k, k): v for k, v in merged_json[0].items()}
# renamed_json

def rename_keys(dict, mapping):
    return {mapping.get(k, k): v for k, v in dict.items()}

renamed_json = [rename_keys(item, key_to_column_mapping) for item in merged_json]
datacite_json = renamed_json

    


In [None]:
# # get dataset metadata - old - get rid?
# info_list = []
# headers = {'client-id': 'bl.nerc'}
# api_url = 'https://api.datacite.org/dois/' 
# # for doi in dataCite_df_relationTypes['data_doi']:
# for (source_id, relation_type_id, occurred_at, Page_endpoint, data_doi, pub_doi_url) in zip(dataCite_df_relationTypes['source-id'],dataCite_df_relationTypes['relation-type-id'],dataCite_df_relationTypes['occurred-at'],dataCite_df_relationTypes['Page endpoint'],dataCite_df_relationTypes['data_doi'], dataCite_df_relationTypes['pub_doi_url']):
#     r = requests.get((api_url + data_doi), headers)
#     print(r.status_code, data_doi)
    
#     try:
#         # process author info
#         author_list = []
#         for item in r.json()['data']['attributes']['creators']:
#             author_list.append(item['name'])

#         info_list.append([
#             r.json()['data']['attributes']['publisher'],
#             data_doi,
#             r.json()['data']['attributes']['titles'][0]['title'],
#             author_list,
#             r.json()['data']['attributes']['publicationYear'],
#             r.json()['data']['attributes']['dates'],
#             r.json()['data']['attributes']['registered'],
#             source_id, relation_type_id, pub_doi_url, occurred_at, Page_endpoint
#         ])
#     except Exception as e:
#         info_list.append(["error",data_doi,"error","error","error","error","error","error","error",pub_doi_url,"error","error"])
        
# columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered', 
#            'source-id', 'relation-type-id', 'pub_doi_url', 'occurred-at', 'Page endpoint']
# dataCite_df = pd.DataFrame(info_list, columns = columns)    
# print("Done!")
    

In [None]:
import importlib

importlib.reload(getPublicationInfo_timeCopy)