## Workflow for retrieving citations of datasets held by NERC data centres from the Crossref Event Data API


In [1]:
import sys
sys.path.insert(0, '..')

# module to run event data queries
import crossRef_fun
import os # some file manipulations
from crossRef_fun import getDataCiteInfo, getCrossRefCitations, filterCrossRefResults, mergeDFs, getPublicationInfo
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

In [2]:
# Query Crossref Event Data for events whose object DOI has the given prefix
# and which occurred inside the date window below.

email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date, end_date = "1990-01-01", "2023-04-19"
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"

# Folder of paged JSON results; the filtering cell reads the events back from it.
results_folder_path_name = "".join(
    [results_folder_path, "NERC_EDS_events_from_", start_date, "_up_to_", end_date]
)

# NOTE(review): the recorded output shows `mailto=Anonymous` in the request URL
# even though `email` is passed here - confirm the address is actually forwarded.
getCrossRefCitations.getCrossRefCitations_byDates(
    email, prefix, start_date, end_date, results_folder_path
)

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-04-19
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/event_data_10.5285_1990-01-01_2023-04-19.json
44369 events found
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=2162cf3a-79be-4bd3-aaa7-1bdb682872b5&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-04-19
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/NERC_EDS_events_from_1990-01-01_up_to_2023-04-19/page0000.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=ec36cf84-461c-40bb-8342-ad101145241a&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-04-19
Event Dat

In [None]:
# Alternative to the date-range query: fetch events for an explicit list of dataset DOIs

email = "matnic@ceh.ac.uk"

# DOIs to look up: the first column of a header-less CSV file
doi_list = pd.read_csv("marika_doi_list.csv", header=None).iloc[:, 0].tolist()

# Destination folder for the result JSON files
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"
results_folder_name = "NERC_EDS_events_from_doi_list_scholix"
results_folder_path_name = "".join([results_folder_path, results_folder_name])

getCrossRefCitations.getCrossRefCitations_byDOI(
    email, doi_list, results_folder_path_name
)

In [4]:
# Filter the events stored under `results_folder_path_name` (set by whichever query cell ran last).
# NOTE(review): judging by the variable name the result has GBIF-related events and
# duplicates removed - confirm against filterCrossRefResults.
crossRef_df_gbif_filtered2_deduplicated = filterCrossRefResults.filterCrossRefResults(results_folder_path_name)

Pass the citation info to the DataCite API to collect relevant information on the datasets, data centres, etc.

In [5]:
# Pass citation info to the DataCite API to collect relevant info on the datasets, data centres etc.
# The call returns a pair that is unpacked into `errors` and `dataCite_df`.
# The recorded output shows a dataset DOI, a publication DOI and an "API response" status per lookup.
(errors, dataCite_df) = getDataCiteInfo.getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

https://doi.org/10.5285/6c6c9203-7333-4d96-88ab-78925e7a4e73 https://doi.org/10.1007/s11368-018-1990-7
API response:  200
https://doi.org/10.5285/4c7fdfa6-f176-4c58-acee-683d5e9d2ed5 https://doi.org/10.5194/gmd-11-1377-2018
API response:  200
https://doi.org/10.5285/18BE23F8-D252-482D-8AF9-5D6A2D40990C https://doi.org/10.1007/s00704-017-2246-y
API response:  200
https://doi.org/10.5285/e1d33b37-f1d4-4234-a0d5-8bf4e657f653 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/4c9613ce-de52-41b1-9fde-7c41f9199686 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/58a8802721c94c66ae45c3baa4d814d0 https://doi.org/10.1007/s00704-018-2476-7
API response:  200
https://doi.org/10.5285/475520d5-bad9-4d84-e053-6c86abc0b01b https://doi.org/10.1007/s10533-017-0350-9
API response:  200
https://doi.org/10.5285/bad1514f-119e-44a4-8e1e-442735bb9797 https://doi.org/10.1007/s11269-018-1914-8
API response:  200
https://doi.org/10.5285/33604

In [6]:
# A bare expression is only rendered when it is the last line of a cell, so
# `errors` has to be displayed explicitly to appear alongside the DataFrame.
display(errors)
dataCite_df

Unnamed: 0,dataset_DOI,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed
0,https://doi.org/10.5285/6c6c9203-7333-4d96-88a...,"Land Cover Map 2015 (vector, GB)","[Rowland, C.S., Morton, R.D., Carrasco, L., Mc...",2017-04-12,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",61,"[{'id': '10.1007/s11368-018-1990-7', 'type': '...",Environmental Information Data Centre (EIDC)
1,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
2,https://doi.org/10.5285/18BE23F8-D252-482D-8AF...,CRU TS3.22: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2014,Issued,[],5,"[{'id': '10.3390/atmos13030421', 'type': 'dois...",Centre for Environmental Data Analysis (CEDA)
3,https://doi.org/10.5285/e1d33b37-f1d4-4234-a0d...,UK Environmental Change Network (ECN) meteorol...,"[Rennie, S., Adamson, J., Anderson, R., Andrew...",2015-05-14,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",11,"[{'id': '10.1016/j.geoderma.2017.08.011', 'typ...",Environmental Information Data Centre (EIDC)
4,https://doi.org/10.5285/4c9613ce-de52-41b1-9fd...,UK Environmental Change Network (ECN) carabid ...,"[Rennie, S., Adamson, J., Anderson, R., Andrew...",2015-02-25,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",6,"[{'id': '10.1007/s00484-018-1509-3', 'type': '...",Environmental Information Data Centre (EIDC)
...,...,...,...,...,...,...,...,...,...
691,https://doi.org/10.5285/8344E4F3-D2EA-44F5-8AF...,Catchment attributes and hydro-meteorological ...,"[Coxon, G., Addor, N., Bloomfield, J.P., Freer...",2020-02-20,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",8,"[{'id': '10.5194/hess-26-1507-2022', 'type': '...",Environmental Information Data Centre (EIDC)
692,https://doi.org/10.5285/e38c58a6-48ec-4ad1-a99...,UKGEOS Glasgow GGC01 Final Borehole Informatio...,"[Monaghan, Alison, Damaschke, Magret, Starcher...",2021-07-29,Available,[],0,[],National Geoscience Data Centre (NGDC)
693,https://doi.org/10.5285/cb336b72-1978-4319-a59...,UKGEOS Cheshire Ince Marshes BHA-101 Core Scan...,[BGS Core Scanning Facility],2022-09-06,Available,[],0,[],National Geoscience Data Centre (NGDC)
694,https://doi.org/10.5285/b06d44e6-324d-4e19-bf7...,UKGEOS Cheshire TH0424 Initial Core Scanning D...,[BGS Core Scanning Facility],2022-03-15,Available,[],0,[],National Geoscience Data Centre (NGDC)


In [None]:
# # What to do with publication type information? if posted content probably want to ignore?
# crossRef_df_gbif_filtered2_deduplicated['obj_id']

# # for each dataset count each type of publication that cites it
# for data_doi in crossRef_df_processed


In [7]:
# Combine the DataCite dataset metadata with the citation events.
# The call returns a pair: `dataset_df`, and the citation frame, which is rebound here.
# In the output displayed below, the citation frame carries the dataset_* and DataCite_* columns.
(dataset_df, crossRef_df_gbif_filtered2_deduplicated) = mergeDFs.mergeDFs(dataCite_df,crossRef_df_gbif_filtered2_deduplicated)

In [8]:
# Inspect the citation frame returned by mergeDFs
crossRef_df_gbif_filtered2_deduplicated

Unnamed: 0,relation_type_id,source_id,obj_id,subj_id,subj_work_type_id,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed
0,references,crossref,https://doi.org/10.5285/6c6c9203-7333-4d96-88a...,https://doi.org/10.1007/s11368-018-1990-7,Info not given,"Land Cover Map 2015 (vector, GB)","[Rowland, C.S., Morton, R.D., Carrasco, L., Mc...",2017-04-12,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",61.0,"[{'id': '10.1007/s11368-018-1990-7', 'type': '...",Environmental Information Data Centre (EIDC)
1,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1377-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
2,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1343-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
3,references,crossref,https://doi.org/10.5285/18BE23F8-D252-482D-8AF...,https://doi.org/10.1007/s00704-017-2246-y,Info not given,CRU TS3.22: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2014,Issued,[],5.0,"[{'id': '10.3390/atmos13030421', 'type': 'dois...",Centre for Environmental Data Analysis (CEDA)
4,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.1002/2017WR021682,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7081,references,crossref,https://doi.org/10.5285/b06d44e6-324d-4e19-bf7...,https://doi.org/10.1144/sp527-2022-58,journal-article,UKGEOS Cheshire TH0424 Initial Core Scanning D...,[BGS Core Scanning Facility],2022-03-15,Available,[],0.0,[],National Geoscience Data Centre (NGDC)
7082,references,crossref,https://doi.org/10.5285/a29c5465-b138-234d-e05...,https://doi.org/10.1007/s13157-021-01443-4,journal-article,The GEBCO_2020 Grid - a continuous terrain mod...,[GEBCO Bathymetric Compilation Group 2020],2019-04/2020-03,Collected,[],56.0,"[{'id': '10.5194/essd-13-2165-2021', 'type': '...",British Oceanographic Data Centre (BODC)
7100,references,crossref,https://doi.org/10.5285/f91cd3ee7b6243d5b7d41b...,https://doi.org/10.1175/jtech-d-21-0179.1,journal-article,ESA Sea State Climate Change Initiative (Sea_S...,"[Piollé, Jean-François, Dodet, Guillaume, Quil...",2020-01-30,Updated,"[{'relationType': 'IsPartOf', 'relatedIdentifi...",1.0,"[{'id': '10.3389/fenrg.2022.929625', 'type': '...",Centre for Environmental Data Analysis (CEDA)
7101,references,crossref,https://doi.org/10.5285/3b602f74-8374-1e90-e05...,https://doi.org/10.1038/s41558-021-01127-1,journal-article,GESLA (Global Extreme Sea Level Analysis) high...,"[Woodworth, Philip L, Hunter, John R, Marcos M...",1846-01-04/2015-05-01,Collected,[],1.0,"[{'id': '10.5194/gmd-15-2035-2022', 'type': 'd...",British Oceanographic Data Centre (BODC)


In [None]:
# # pass publication DOIs to DOI.org to determine type of publication using checkDOIpubType function defined above
# pubTypeList = []
# pubDOI = []

# for count, doi in enumerate(crossRef_df_processed['subj_id']):
#     doiComponents = doi.split('/')[-2:]
#     doi = doiComponents[0] + "/" + doiComponents[1]
#     # print(type(doi))
#     pubType = checkDOIpubType(doi)
#     pubTypeList.append(pubType)
#     pubDOI.append(doi)
    
#     # add code to catch retries limit exceeded - might need to be in function itself?
#     # e.g. https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
    
#     time.sleep(0.5)
#     # if count % 200 == 0: # if count is a multiple of 200 wait for a bit
#     #         time.sleep(180)

# print('Done!')

# crossRef_df_processed['publicationType'] = pubTypeList
# crossRef_df_processed['publication_DOI'] = pubDOI


# # split publicationType column into publicationType1 and publicationSubType columns
# crossRef_df_processed[['publicationType1','publicationSubType']] = pd.DataFrame(crossRef_df_processed['publicationType'].tolist(), index=crossRef_df_processed.index)


# # determine publication type for records where DOI.org API call failed - ~10 mins
# newPubTypeList = []

# # get the unknown rows from scholex_df pubtype column and the pubDOI
# for pubType, pubDOI in zip(crossRef_df_processed['publicationType1'],crossRef_df_processed['publication_DOI']):
    
#     if pubType == 'unknown':

#         # need a catcher to make sure pubDOI is a single ID - in the cases where there was no DOI there is more than one ID recorded
#         if len(pubDOI) < 100: # if the length is less than 100 (arbitrarily) then it will be a single DOI
#             pass
#         else: # if there is no DOI skip this record
#             newPubTypeList.append(pubType) # leave it the same
#             #newPubType = pubType 
#             continue
        
#         # determine if crossref or datacite supplies the DOI
#         print('Pub DOI: ', pubDOI)
#         r = requests.get(('https://doi.org/doiRA/' + pubDOI), headers={"Accept": "application/json"})
#         DOIregistry = r.json()[0]['RA']
#         print(DOIregistry)

#         # query the crossref or datacite API
#         if DOIregistry == 'DataCite':
#             # ask the Datacite API what type of publication
#             r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

#         elif DOIregistry == 'Crossref':
#             r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['message']['type']) # could also add 'subtype':r.json()['message']['subtype']
#         else:
#             print('Unknown DOI registry')
        
#     else: 
#         newPubTypeList.append(pubType) # in the cases where the pubType is not unknown keep it the same

# crossRef_df_processed['newPubTypeList'] = newPubTypeList
# print("Done")

In [None]:
# Ask doi.org which registration agency (RA) issued an example publication DOI
pubDOI = '10.1007/s11368-018-1990-7'
r = requests.get(
    'https://doi.org/doiRA/' + pubDOI,
    headers={"Accept": "application/json"},
)
ra_records = r.json()  # indexed below as a list whose first entry holds the 'RA' field
DOIregistry = ra_records[0]['RA']
print(DOIregistry)

In [None]:
# query the crossref or datacite API
if DOIregistry == 'DataCite':
    # ask the Datacite API what type of publication
    r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
    print(r.status_code)
    newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

In [None]:
# Fetch the Crossref works record for `pubDOI` (set in an earlier cell) and show its 'message' part
crossref_works_url = 'https://api.crossref.org/works/' + pubDOI
r = requests.get(crossref_works_url, headers={"Accept": "application/json"})
print(r.status_code)
r.json()['message']

In [None]:

# Idea: add a section that pulls publication info (e.g. title, authors) from the DOI.org API
# into crossRef_df_gbif_filtered2_deduplicated. This cell tries the request on one example DOI.
doi_url = 'https://doi.org/10.5194/gmd-11-1377-2018'
print(doi_url)
json_headers = {"Accept": "application/json"}
r = requests.get(doi_url, headers=json_headers)  # sometimes throws up unexpected errors

# The response body is not always JSON, in which case decoding it here fails
r.json()

In [9]:
# slow if df is large
# The recorded output lists each publication DOI, and the frame displayed afterwards
# has the extra columns pub_Title, pub_authors and publisher.
# NOTE(review): the result is assigned back to the input variable, so re-running this
# cell passes in the already-enriched frame - confirm getPublicationInfo tolerates that.
crossRef_df_gbif_filtered2_deduplicated = getPublicationInfo.getPublicationInfo(crossRef_df_gbif_filtered2_deduplicated)

https://doi.org/10.1007/s11368-018-1990-7
https://doi.org/10.5194/gmd-11-1377-2018
https://doi.org/10.5194/gmd-11-1343-2018
https://doi.org/10.1007/s00704-017-2246-y
https://doi.org/10.1002/2017WR021682
https://doi.org/10.1007/s00484-018-1509-3
https://doi.org/10.1007/s00484-018-1509-3
https://doi.org/10.1007/s00704-018-2476-7
https://doi.org/10.1007/s00704-018-2392-x
https://doi.org/10.1007/s00704-018-2532-3
https://doi.org/10.1002/joc.5221
https://doi.org/10.1007/s10533-017-0350-9
https://doi.org/10.1007/s11269-018-1914-8
https://doi.org/10.1007/s10584-018-2145-y
https://doi.org/10.1007/s10584-018-2158-6
https://doi.org/10.1007/s00382-018-4183-6
https://doi.org/10.1007/s00382-018-4234-z
https://doi.org/10.1007/s11600-018-0165-7
https://doi.org/10.5194/bg-14-799-2017
https://doi.org/10.5194/bg-14-1181-2017
https://doi.org/10.5194/bg-14-2069-2017
https://doi.org/10.5194/hess-21-4785-2017
https://doi.org/10.5194/hess-21-1189-2017
https://doi.org/10.5194/hess-22-611-2018
https://doi.org/

In [None]:
# add a step to remove results not from an approved list of publishers/journals?

# how to make a list of approved publishers?

In [10]:
# Inspect the citation frame after the publication info has been added
crossRef_df_gbif_filtered2_deduplicated

Unnamed: 0,relation_type_id,source_id,obj_id,subj_id,subj_work_type_id,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed,pub_Title,pub_authors,publisher
0,references,crossref,https://doi.org/10.5285/6c6c9203-7333-4d96-88a...,https://doi.org/10.1007/s11368-018-1990-7,Info not given,"Land Cover Map 2015 (vector, GB)","[Rowland, C.S., Morton, R.D., Carrasco, L., Mc...",2017-04-12,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",61.0,"[{'id': '10.1007/s11368-018-1990-7', 'type': '...",Environmental Information Data Centre (EIDC),Evaluating tracer selection for catchment sedi...,"[[Hugh G., Smith], [Daljit Singh, Karam], [Amy...",Springer Science and Business Media LLC
1,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1377-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA),LPJmL4 – a dynamic global vegetation model wit...,"[[Sibyll, Schaphoff], [Matthias, Forkel], [Chr...",Copernicus GmbH
2,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1343-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA),LPJmL4 – a dynamic global vegetation model wit...,"[[Sibyll, Schaphoff], [Werner, von Bloh], [Anj...",Copernicus GmbH
3,references,crossref,https://doi.org/10.5285/18BE23F8-D252-482D-8AF...,https://doi.org/10.1007/s00704-017-2246-y,Info not given,CRU TS3.22: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2014,Issued,[],5.0,"[{'id': '10.3390/atmos13030421', 'type': 'dois...",Centre for Environmental Data Analysis (CEDA),Variability of temperature properties over Ken...,"[[Victor, Ongoma], [Haishan, Chen], [Chujie, G...",Springer Science and Business Media LLC
4,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.1002/2017WR021682,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA),Intercomparison and Uncertainty Assessment of ...,"[[Anna A., Sörensson], [Romina C., Ruscica]]",American Geophysical Union (AGU)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7081,references,crossref,https://doi.org/10.5285/b06d44e6-324d-4e19-bf7...,https://doi.org/10.1144/sp527-2022-58,journal-article,UKGEOS Cheshire TH0424 Initial Core Scanning D...,[BGS Core Scanning Facility],2022-03-15,Available,[],0.0,[],National Geoscience Data Centre (NGDC),Unlocking national treasures: the core scannin...,"[[M., Damaschke], [M. W., Fellgett], [M. P. A....",Geological Society of London
7082,references,crossref,https://doi.org/10.5285/a29c5465-b138-234d-e05...,https://doi.org/10.1007/s13157-021-01443-4,journal-article,The GEBCO_2020 Grid - a continuous terrain mod...,[GEBCO Bathymetric Compilation Group 2020],2019-04/2020-03,Collected,[],56.0,"[{'id': '10.5194/essd-13-2165-2021', 'type': '...",British Oceanographic Data Centre (BODC),Coastal Wetlands Exposure to Storm Surge and W...,"[[Felício, Cassalho], [Tyler W., Miesse], [And...",Springer Science and Business Media LLC
7100,references,crossref,https://doi.org/10.5285/f91cd3ee7b6243d5b7d41b...,https://doi.org/10.1175/jtech-d-21-0179.1,journal-article,ESA Sea State Climate Change Initiative (Sea_S...,"[Piollé, Jean-François, Dodet, Guillaume, Quil...",2020-01-30,Updated,"[{'relationType': 'IsPartOf', 'relatedIdentifi...",1.0,"[{'id': '10.3389/fenrg.2022.929625', 'type': '...",Centre for Environmental Data Analysis (CEDA),Error Characterization of Significant Wave Hei...,"[[Guillaume, Dodet], [Saleh, Abdalla], [Matias...",American Meteorological Society
7101,references,crossref,https://doi.org/10.5285/3b602f74-8374-1e90-e05...,https://doi.org/10.1038/s41558-021-01127-1,journal-article,GESLA (Global Extreme Sea Level Analysis) high...,"[Woodworth, Philip L, Hunter, John R, Marcos M...",1846-01-04/2015-05-01,Collected,[],1.0,"[{'id': '10.5194/gmd-15-2035-2022', 'type': 'd...",British Oceanographic Data Centre (BODC),Extreme sea levels at different global warming...,"[[Claudia, Tebaldi], [Roshanka, Ranasinghe], [...",Springer Science and Business Media LLC


### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# TAKES A LONG TIME - can skip this if you don't want this info
# Fetch an APA-formatted citation string for every citing publication through
# DOI content negotiation. The citation style is a parameter of the Accept
# media type; it was previously sent as a separate "style" header.
citation_headers = {"Accept": "text/x-bibliography; style=apa"}

# publication DOI URL -> citation string, so a DOI that cites several datasets
# is only requested once
citationStrByDOI = {}

for pubDOI in crossRef_df_gbif_filtered2_deduplicated['subj_id']:
    if pubDOI not in citationStrByDOI:
        r = requests.get(pubDOI, headers=citation_headers)
        #print(r.status_code)
        citationStrByDOI[pubDOI] = r.text

# one citation string per row, in the same order as the 'subj_id' column
citationStrList = [citationStrByDOI[pubDOI] for pubDOI in crossRef_df_gbif_filtered2_deduplicated['subj_id']]

crossRef_df_gbif_filtered2_deduplicated['PubCitationStr'] = citationStrList # add the citation strings as a new column

In [None]:
# Inspect the citation frame, including the 'PubCitationStr' column if the previous cell was run
crossRef_df_gbif_filtered2_deduplicated

In [12]:
# load results
# Reads two CSV files saved by an earlier run (2000-01-01 to 2023-03-01 per the file names).
# NOTE(review): this rebinds `dataset_df`, replacing the frame returned by mergeDFs above.
# The export cells further down write `dataset_df`, so on a top-to-bottom run they would
# save this reloaded table instead of the freshly computed one - confirm that is intended.
crossRef_df = pd.read_csv('C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations - copy of older version of code 14-03-23/Crossref API/event_data_notebooks-master/NERC_EDS/dataset_citation_publication_info_2000-01-01_to_2023-03-01_retrieved_03032023.csv')
dataset_df = pd.read_csv('C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations - copy of older version of code 14-03-23/Crossref API/event_data_notebooks-master/NERC_EDS/dataset_citation_counts_2000-01-01_to_2023-03-01_retrieved_03032023.csv')

In [14]:
# DataCite results - inspect the raw DataCite citations entry stored for the row labelled 2
dataset_df.loc[2, 'DataCite_Citations_list']


"[{'id': '10.3390/atmos13030421', 'type': 'dois'}, {'id': '10.1016/j.ebiom.2016.03.046', 'type': 'dois'}, {'id': '10.1080/08898480.2019.1592638', 'type': 'dois'}, {'id': '10.1007/s13143-016-0011-2', 'type': 'dois'}, {'id': '10.1029/2018jd030150', 'type': 'dois'}]"

In [11]:
# Write both result tables to CSV; file names carry the query date window and the retrieval date

from datetime import date
today = date.today()
retrieved_stamp = today.strftime("%d%m%Y")  # day, month, year with no separators

# Citation-level table (one row per dataset/publication pair)
crossRef_df_processed_filename = "".join([
    results_folder_path, 'dataset_citation_publication_info_', start_date,
    "_to_", end_date, "_retrieved_", retrieved_stamp, '.csv',
])
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index=False)
print(crossRef_df_processed_filename)

# Dataset-level table
dataset_filename = "".join([
    results_folder_path, 'dataset_citation_counts_', start_date,
    "_to_", end_date, "_retrieved_", retrieved_stamp, '.csv',
])
dataset_df.to_csv(dataset_filename, index=False)
print(dataset_filename)

C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/dataset_citation_publication_info_1990-01-01_to_2023-04-19_retrieved_20042023.csv
C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/dataset_citation_counts_1990-01-01_to_2023-04-19_retrieved_20042023.csv


In [8]:
# Write both result tables to CSV when the query was driven by a DOI list rather than dates;
# file names carry the list name and the retrieval date

from datetime import date
today = date.today()
retrieved_stamp = today.strftime("%d%m%Y")  # day, month, year with no separators
doi_list_name = "marika_doi_list_scholix"

# Citation-level table (one row per dataset/publication pair)
crossRef_df_processed_filename = "".join([
    results_folder_path, 'dataset_citation_publication_info_', doi_list_name,
    "_retrieved_", retrieved_stamp, '.csv',
])
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index=False)
print(crossRef_df_processed_filename)

# Dataset-level table
dataset_filename = "".join([
    results_folder_path, 'dataset_citation_counts_', doi_list_name,
    "_retrieved_", retrieved_stamp, '.csv',
])
dataset_df.to_csv(dataset_filename, index=False)
print(dataset_filename)

C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/dataset_citation_publication_info_marika_doi_list_scholix_retrieved_17032023.csv
C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/dataset_citation_counts_marika_doi_list_scholix_retrieved_17032023.csv
