### Workflow to collect citation information for datasets published by NERC data centres

In [10]:
import sys
sys.path.insert(0, '..')

# module to run event data queries
from crossRef_fun import getDataCiteInfo, eventData, eventRecord
import os # some file manipulations
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

In [None]:
# Settings for the date-range query: contact email, DOI prefix and date window
email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "2023-01-01"
end_date = "2023-03-01"

# Check that both dates are valid ISO dates: fromisoformat raises ValueError
# otherwise, which stops the cell before any query is built or sent
datetime.date.fromisoformat(start_date)
datetime.date.fromisoformat(end_date)

# filename to save json event data to
filename = f"event_data_{prefix}_{start_date}_{end_date}.json"

# Set up the query
# NOTE(review): this cell calls mrced2.eventData, but the import cell directly
# above imports eventData from crossRef_fun and does not import mrced2 -
# confirm which module is intended before running this workflow on its own
ed = mrced2.eventData(email=email, outputFile=filename)
ed.buildQuery({'obj-id.prefix': prefix, 'from-occurred-date': start_date, 'until-occurred-date': end_date})

# run the query to determine number of events
ed.runQuery(retry=5)  # scholix = False - can query scholix api as well - worth exploring

# calculate how many pages will need to be iterated over (1000 rows are requested per page below)
num_pages = ceil(ed.events.count() / 1000)

# set up folder to write the result jsons into
# exist_ok=True lets the cell be re-run for the same date range; plain os.mkdir
# raises FileExistsError when the folder is already there. Files already in the
# folder are left in place.
results_folder = "NERC_EDS_events_from_" + start_date + "_up_to_" + end_date
os.makedirs(results_folder, exist_ok=True)

# find info from all the pages
ed.getAllPages(
    num_pages,
    {'rows': 1000, 'obj-id.prefix': prefix, 'from-occurred-date': start_date, 'until-occurred-date': end_date},
    fileprefix=results_folder + '/page',
)

In [1]:
# Code for getting results based on a list of DOIs rather than a date range
import sys
sys.path.insert(0, '..')

import mrced2 # module to run event data queries
from getDataCiteInfo import getDataCiteInfo
import os # some file manipulations
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

# Contact email handed to each eventData query object below
email = "matnic@ceh.ac.uk"

# import list of DOIs to find event data for
# header=None: the CSV has no header row, so the DOIs are read as column 0
doi_list = pd.read_csv("marika_doi_list.csv", header = None)

# set up folder to write the result jsons into (re-used if it already exists)
results_folder = "NERC_EDS_events_from_doi_list_scholix"
os.makedirs(results_folder, exist_ok=True)

# One query per DOI; count is the DOI's position in the list and is used in the file names
for count, doi in enumerate(doi_list[0]):
    print(doi)

    # the DOI contains "/", which cannot be used inside a file name
    fileDoi = doi.replace("/", "_")

    # Set up the query
    # filename to save json event data to
    filename = f"{results_folder}/event_data_{fileDoi}_{count}.json"
    ed = mrced2.eventData(email=email, outputFile=filename)
    ed.buildQuery({'obj-id': doi})

    # run the query to determine number of events
    ed.runQuery(retry=5) # scholix = False - can query scholix api as well - worth exploring

    # calculate how many pages will need to be iterated over (1000 rows are requested per page)
    num_pages = ceil(ed.events.count()/1000)

    # find info from all the pages
    ed.getAllPages(num_pages, {'rows': 1000, 'obj-id': doi}, fileprefix=f"{results_folder}/{fileDoi}_page{count}") # fileDoi gives each result json a unique name, otherwise it would overwrite the previous DOI's results

print('Done!')


10.5285/0a7d65e8-8bc8-46e5-ab72-ee64ed851583
https://api.eventdata.crossref.org/v1/events/scholix?mailto=Anonymous&rows=1000&obj-id=10.5285/0a7d65e8-8bc8-46e5-ab72-ee64ed851583
Event Data query started...
API query complete  200
output file written to NERC_EDS_events_from_doi_list_scholix/event_data_10.5285_0a7d65e8-8bc8-46e5-ab72-ee64ed851583_0.json
9 events found
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=88c6e237-05c7-46e2-9ff3-730cbc5574a1&rows=1000&obj-id=10.5285/0a7d65e8-8bc8-46e5-ab72-ee64ed851583
Event Data query started...
API query complete  200
output file written to NERC_EDS_events_from_doi_list_scholix/10.5285_0a7d65e8-8bc8-46e5-ab72-ee64ed851583_page00000.json
10.5285/1d78e01a-a9c1-4371-8482-1c1b57d9661f
https://api.eventdata.crossref.org/v1/events/scholix?mailto=Anonymous&rows=1000&obj-id=10.5285/1d78e01a-a9c1-4371-8482-1c1b57d9661f
Event Data query started...
API query complete  200
output file written to NERC_EDS_events_from_doi_list_scholix/e

In [2]:
# instance of a class to interpret the events
jd1 = mrced2.eventRecord()

# get the filenames of the result jsons
# os.listdir returns every entry in the folder, so keep only the .json outputs
# (skipping e.g. .ipynb_checkpoints or .DS_Store) and sort them so the merge
# order - and so which duplicate row is kept further down - is the same on every run
files = sorted(name for name in os.listdir(results_folder) if name.endswith(".json"))

# load the json event data from multiple files
jd1.mergeJsons(files, folder = results_folder)

## filter out twitter, wikipedia etc. - later add options to include these
filters = {"source_id" : ['twitter', 'wikipedia', 'newsfeed', 'wordpressdotcom', 'reddit-links']}
filtered_info = jd1.filter(filters, mode = 'NOT')

# collect relevant citation info for NERC project
citationInfo = filtered_info.collectCitationInfo()

# convert to dataframe
crossRef_df = pd.DataFrame(citationInfo)

# filter out gbif registrant code prefix 10.15468
# regex=False matches the prefix literally; as a regex the "." would match any character
crossRef_df_gbif_filtered = crossRef_df[~crossRef_df.subj_id.str.contains("10.15468", regex=False)]

# filter out relation_type_id values that we don't want (the pattern is a regex alternation)
crossRef_df_gbif_filtered2 = crossRef_df_gbif_filtered[~crossRef_df_gbif_filtered.relation_type_id.str.contains("is_referenced_by|discusses|is_new_version_of|is_supplemented_by|is_previous_version_of")]

# remove duplicate subj_ids for each obj_id - e.g. 10.5285/a7f28dea-64f7-43b5-bc39-a6cfcdeefbda has multiple references from 10.5285/65140444-b5fa-4a5e-9ab4-e86c106051e2
# find rows where obj_id and subj_id are the same - should I match any other columns?
dups = crossRef_df_gbif_filtered2.duplicated(subset=['obj_id', 'subj_id'])

# Keep the first row of each (obj_id, subj_id) pair. Selecting with the boolean
# mask avoids dropping by index label, and .copy() gives an independent frame
# for the column assignments made on it in later cells.
crossRef_df_gbif_filtered2_deduplicated = crossRef_df_gbif_filtered2[~dups].copy()



In [3]:
# Pass citation info to the DataCite API to collect relevant info on the datasets, data centres etc.
# getDataCiteInfo returns a pair, unpacked here as (errors, dataCite_df); later cells
# index dataCite_df by its 'dataset_DOI' column.
(errors, dataCite_df) = getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

https://doi.org/10.5285/507A5E1F-E056-454C-8FF6-D185F3DA8556 https://doi.org/10.29110/soylemdergi.1169829
API response:  200
https://doi.org/10.5285/0995e94d-6d42-40c1-8ed4-5090d82471e1 https://doi.org/10.1075/silv.27.11bla
API response:  200
https://doi.org/10.5285/0a7d65e8-8bc8-46e5-ab72-ee64ed851583 https://doi.org/10.1111/icad.12578
API response:  200
https://doi.org/10.5285/10874370-bc58-4d23-a118-ea07df8a07f2 https://doi.org/10.5194/essd-10-951-2018
API response:  200
https://doi.org/10.5285/1d78e01a-a9c1-4371-8482-1c1b57d9661f https://doi.org/10.1002/esp.4005
API response:  200
https://doi.org/10.5285/331659d7-da72-48a2-9b52-63c003557990 https://doi.org/10.1007/s10661-022-10118-4
API response:  200
https://doi.org/10.5285/33604ea0-c238-4488-813d-0ad9ab7c51ca https://doi.org/10.1007/s10584-018-2145-y
API response:  200
https://doi.org/10.5285/5dc179dc-f692-49ba-9326-a6893a503f6e https://doi.org/10.5194/hess-23-4011-2019
API response:  200
https://doi.org/10.5285/5f0605e4-aa2a-48a

Pass the citation info to the DataCite API to collect relevant information on the datasets, data centres, etc.

In [4]:
# A notebook cell only echoes its final expression, so a bare `errors` line
# shows nothing: print it explicitly and leave dataCite_df last for the table display
print(errors)
dataCite_df

Unnamed: 0,dataset_DOI,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed
0,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC)
1,https://doi.org/10.5285/0995e94d-6d42-40c1-8ed...,UK gridded population 2011 based on Census 201...,"[Reis, S., Liska, T., Steinle, S., Carnell, E....",2017-11-22,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",1,"[{'id': '10.1016/j.envint.2018.10.005', 'type'...",Environmental Information Data Centre (EIDC)
2,https://doi.org/10.5285/0a7d65e8-8bc8-46e5-ab7...,Moth trends for Britain and Ireland from the R...,"[Harrower, C.A., Bell, J.R., Blumgart, D., Bot...",2020-04-20,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",2,"[{'id': '10.1111/icad.12578', 'type': 'dois'},...",Environmental Information Data Centre (EIDC)
3,https://doi.org/10.5285/10874370-bc58-4d23-a11...,Climate hydrology and ecology research support...,"[Robinson, E.L., Blyth, E., Clark, D.B., Comyn...",2016-12-12,Submitted,"[{'relationType': 'IsNewVersionOf', 'relatedId...",4,"[{'id': '10.5194/essd-10-951-2018', 'type': 'd...",Environmental Information Data Centre (EIDC)
4,https://doi.org/10.5285/1d78e01a-a9c1-4371-848...,"Land Cover Map 2007 (vector, GB)","[Morton, R.D., Rowland, C., Wood, C., Meek, L....",2011-04-08,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",9,"[{'id': '10.1002/esp.4005', 'type': 'dois'}, {...",Environmental Information Data Centre (EIDC)
5,https://doi.org/10.5285/331659d7-da72-48a2-9b5...,High resolution water quality and flow monitor...,"[Hawkins, C.E, Kelly, T.J., Loewenthal, M., Sm...",2019-04-03,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",1,"[{'id': '10.1007/s10661-022-10118-4', 'type': ...",Environmental Information Data Centre (EIDC)
6,https://doi.org/10.5285/33604ea0-c238-4488-813...,Gridded estimates of daily and monthly areal r...,"[Tanguy, M., Dixon, H., Prosdocimi, I., Morris...",2016-11-04,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",47,"[{'id': '10.1016/j.jhydrol.2018.07.034', 'type...",Environmental Information Data Centre (EIDC)
7,https://doi.org/10.5285/5dc179dc-f692-49ba-932...,Gridded estimates of daily and monthly areal r...,"[Tanguy, M., Dixon, H., Prosdocimi, I., Morris...",2014-05-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",25,"[{'id': '10.5194/hess-21-1189-2017', 'type': '...",Environmental Information Data Centre (EIDC)
8,https://doi.org/10.5285/5f0605e4-aa2a-48ab-b47...,ITE Land Classification of Great Britain 2007,"[Bunce, R.G.H., Barr, C.J., Clarke, R.T., Howa...",2007-05-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",5,"[{'id': '10.1111/ele.13157', 'type': 'dois'}, ...",Environmental Information Data Centre (EIDC)
9,https://doi.org/10.5285/6b0c4358-2bf3-4924-aa8...,High-resolution global topographic index values,"[Marthews, T.R., Dadson, S.J., Lehner, B., Abe...",2015-03-03,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",8,"[{'id': '10.5194/hess-19-91-2015', 'type': 'do...",Environmental Information Data Centre (EIDC)


In [None]:
# # What to do with publication type information? if posted content probably want to ignore?
# crossRef_df_gbif_filtered2_deduplicated['obj_id']

# # for each dataset count each type of publication that cites it
# for data_doi in crossRef_df_processed


In [5]:
# Copy the DataCite dataset metadata onto the Crossref citation rows.
# Every dataCite_df column after the first (the first being the dataset DOI
# itself) becomes a {dataset DOI: value} lookup that is mapped onto the
# citation rows through their obj_id.
for col in dataCite_df.columns[1:]:
    d = dataCite_df.set_index('dataset_DOI')[col].to_dict()
    crossRef_df_gbif_filtered2_deduplicated.loc[:, col] = crossRef_df_gbif_filtered2_deduplicated['obj_id'].map(d)

# Build a table with one row per dataset, to hold citation counts from
# crossref, scholix, datacite etc.: keep the first row seen for each obj_id and
# discard the columns that describe an individual citing publication.
dups = crossRef_df_gbif_filtered2_deduplicated.duplicated('obj_id')
repeated_rows = crossRef_df_gbif_filtered2_deduplicated[dups].index
per_citation_columns = ['relation_type_id', 'source_id', 'subj_id', 'subj_work_type_id']
dataset_df = (
    crossRef_df_gbif_filtered2_deduplicated
    .drop(repeated_rows)
    .drop(per_citation_columns, axis = 1)
)

# Crossref citation count = number of citation rows held for each dataset DOI
# (still to do: counts that include or exclude particular relation_type_id and
# subj_work_type_id values)
crossRef_citation_counts = crossRef_df_gbif_filtered2_deduplicated['obj_id'].value_counts()

# turn the counts into a {dataset DOI: count} dictionary and attach them to dataset_df
d = crossRef_citation_counts.to_dict()
dataset_df['crossRef_citation_count'] = dataset_df['obj_id'].map(d)



In [None]:
# # pass publication DOIs to DOI.org to determine type of publication using checkDOIpubType function defined above
# pubTypeList = []
# pubDOI = []

# for count, doi in enumerate(crossRef_df_processed['subj_id']):
#     doiComponents = doi.split('/')[-2:]
#     doi = doiComponents[0] + "/" + doiComponents[1]
#     # print(type(doi))
#     pubType = checkDOIpubType(doi)
#     pubTypeList.append(pubType)
#     pubDOI.append(doi)
    
#     # add code to catch retries limit exceeded - might need to be in function itself?
#     # e.g. https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
    
#     time.sleep(0.5)
#     # if count % 200 == 0: # if count is a multiple of 200 wait for a bit
#     #         time.sleep(180)

# print('Done!')

# crossRef_df_processed['publicationType'] = pubTypeList
# crossRef_df_processed['publication_DOI'] = pubDOI


# # split publicationType column into publicationType1 and publicationSubType columns
# crossRef_df_processed[['publicationType1','publicationSubType']] = pd.DataFrame(crossRef_df_processed['publicationType'].tolist(), index=crossRef_df_processed.index)


# # determine publication type for records where DOI.org API call failed - ~10 mins
# newPubTypeList = []

# # get the unknown rows from scholex_df pubtype column and the pubDOI
# for pubType, pubDOI in zip(crossRef_df_processed['publicationType1'],crossRef_df_processed['publication_DOI']):
    
#     if pubType == 'unknown':

#         # need a catcher to make sure pubDOI is a single ID - in the cases where there was no DOI there is more than one ID recorded
#         if len(pubDOI) < 100: # if the length is less than 100 (arbitrarily) then it will be a single DOI
#             pass
#         else: # if there is no DOI skip this record
#             newPubTypeList.append(pubType) # leave it the same
#             #newPubType = pubType 
#             continue
        
#         # determine if crossref or datacite supplies the DOI
#         print('Pub DOI: ', pubDOI)
#         r = requests.get(('https://doi.org/doiRA/' + pubDOI), headers={"Accept": "application/json"})
#         DOIregistry = r.json()[0]['RA']
#         print(DOIregistry)

#         # query the crossref or datacite API
#         if DOIregistry == 'DataCite':
#             # ask the Datacite API what type of publication
#             r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

#         elif DOIregistry == 'Crossref':
#             r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['message']['type']) # could also add 'subtype':r.json()['message']['subtype']
#         else:
#             print('Unknown DOI registry')
        
#     else: 
#         newPubTypeList.append(pubType) # in the cases where the pubType is not unknown keep it the same

# crossRef_df_processed['newPubTypeList'] = newPubTypeList
# print("Done")

In [None]:
# Ask doi.org which registration agency (RA) a publication DOI belongs to
pubDOI = '10.1007/s11368-018-1990-7'
# timeout so a stalled connection cannot hang the notebook indefinitely
r = requests.get('https://doi.org/doiRA/' + pubDOI, headers={"Accept": "application/json"}, timeout=30)
# The answer is read from the first list entry. Fall back to 'unknown' instead
# of raising KeyError when that entry carries no 'RA' key
# NOTE(review): presumably doi.org reports a 'status' instead of 'RA' for DOIs it does not recognise - confirm
DOIregistry = r.json()[0].get('RA', 'unknown')
print(DOIregistry)

In [None]:
# query the DataCite API when DataCite is the registration agency for pubDOI
if DOIregistry == 'DataCite':
    # ask the Datacite API what type of publication
    # timeout so a stalled connection cannot hang the notebook indefinitely
    r = requests.get('https://api.datacite.org/dois/' + pubDOI, headers = {'client-id': 'bl.nerc'}, timeout=30)
    print(r.status_code)
    # newPubTypeList is only assigned inside the commented-out cell, so appending
    # to it here raises NameError; print the publication type instead
    print(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

In [None]:
# Fetch the Crossref works record for pubDOI, print the HTTP status code and
# show the 'message' part of the JSON response as the cell output
r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
print(r.status_code)
r.json()['message']

In [14]:

# add a section to get publication info from DOI.org api to add to crossRef_df_gbif_filtered2_deduplicated? e.g. title, authors etc
# Example for a single publication: request the DOI URL with an
# "Accept: application/json" header and show the parsed response
doi_url = 'https://doi.org/10.5194/gmd-11-1377-2018'
print(doi_url)
r = requests.get(doi_url, headers={"Accept": "application/json"}) # sometimes throws up unexpected errors

# sometimes the API call returns info in a format that is not JSON, in which case r.json() raises
r.json()

https://doi.org/10.5194/gmd-11-1377-2018


{'indexed': {'date-parts': [[2023, 2, 9]],
  'date-time': '2023-02-09T14:27:24Z',
  'timestamp': 1675952844171},
 'reference-count': 112,
 'publisher': 'Copernicus GmbH',
 'issue': '4',
 'license': [{'start': {'date-parts': [[2018, 4, 12]],
    'date-time': '2018-04-12T00:00:00Z',
    'timestamp': 1523491200000},
   'content-version': 'unspecified',
   'delay-in-days': 0,
   'URL': 'https://creativecommons.org/licenses/by/4.0/'}],
 'content-domain': {'domain': [], 'crossmark-restriction': False},
 'abstract': '<jats:p>Abstract. The dynamic global vegetation model LPJmL4 is a process-based model that simulates climate and land use change impacts on the terrestrial biosphere, agricultural production, and the water and carbon cycle. Different versions of the model have been developed and applied to evaluate the role of natural and managed ecosystems in the Earth system and the potential impacts of global environmental change. A comprehensive model description of the new model version, LPJ

In [6]:
# very slow if crossRef_df_gbif_filtered2_deduplicated is large - about 1 hour for everything from year 2000
# TODO: change this to the Crossref API because the DOI.org API is unreliable
# section to get publication info (title, authors, publisher) from the DOI.org api
# and add it to crossRef_df_gbif_filtered2_deduplicated
pub_info = []

# Query each distinct publication DOI once: the results are joined back onto the
# rows by DOI below, so repeated subj_id values would only repeat the same request
for pubdoi in crossRef_df_gbif_filtered2_deduplicated['subj_id'].unique():
    print(pubdoi)

    # Fetch and parse the metadata once per DOI. A network failure, a timeout or
    # a body that is not JSON is reported and treated as "no metadata", so one
    # bad DOI does not abort the whole (long) run.
    try:
        r = requests.get(pubdoi, headers={"Accept": "application/json"}, timeout=30)
        record = r.json()
    except (requests.RequestException, ValueError) as err:
        print("  could not get metadata:", err)
        record = {}
    if not isinstance(record, dict):
        record = {}

    # authors as [given, family] pairs; if any author entry lacks either name
    # the whole field is reported as not given
    try:
        authors = [[person['given'], person['family']] for person in record['author']]
    except (KeyError, TypeError):
        authors = "Info not given"

    pub_info.append({
        'pub_doi': pubdoi,
        'pub_Title': record.get('title', "Info not given"),
        'pub_authors': authors,
        'publisher': record.get('publisher', "Info not given"),
        # add publication date to this - in order to check if this is before the dataset publication date - could be a way to filter out dodgy results
    })

# add new publication info columns to dataframe
pubInfo_df = pd.DataFrame(pub_info)

# loop through new columns to be added to df (skipping the first column, pub_doi)
for ii in pubInfo_df.columns[1:]:
    # create dictionary of doi, value pairs
    d = pubInfo_df.set_index('pub_doi')[ii].to_dict()

    # use the doi to map the dictionary to crossRef_df_gbif_filtered2_deduplicated
    crossRef_df_gbif_filtered2_deduplicated.loc[:,ii] = crossRef_df_gbif_filtered2_deduplicated.subj_id.map(d)

print("Done!")

https://doi.org/10.29110/soylemdergi.1169829
https://doi.org/10.1186/s12888-022-04460-7
https://doi.org/10.34234/ded.1166387
https://doi.org/10.46652/rgn.v6i27.766
https://doi.org/10.1007/978-3-030-78280-1_11
https://doi.org/10.3145/epi.2023.ene.06
https://doi.org/10.22430/24223182.1800
https://doi.org/10.4236/jfrm.2021.104023
https://doi.org/10.24142/rvc.n26a4
https://doi.org/10.22495/cbsrv3i2art10
https://doi.org/10.1075/silv.27.11bla
https://doi.org/10.1038/s41598-021-81716-4
https://doi.org/10.1111/icad.12578
https://doi.org/10.1007/s10531-022-02469-8
https://doi.org/10.5194/essd-10-951-2018
https://doi.org/10.1002/esp.4005
https://doi.org/10.1016/j.ecolind.2022.108610
https://doi.org/10.1007/s10661-022-10118-4
https://doi.org/10.1007/s10584-018-2145-y
https://doi.org/10.5194/hess-24-561-2020
https://doi.org/10.5194/gmd-12-2285-2019
https://doi.org/10.1016/j.ejrh.2021.100967
https://doi.org/10.3390/atmos12111533
https://doi.org/10.1088/1748-9326/ac7a4e
https://doi.org/10.1007/s1058

In [None]:
# add a step to remove results not from an approved list of publishers/journals?

# how to make a list of approved publishers?

In [7]:
# Show the citation table, now including the dataset and publication columns
crossRef_df_gbif_filtered2_deduplicated

Unnamed: 0,relation_type_id,source_id,obj_id,subj_id,subj_work_type_id,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed,pub_Title,pub_authors,publisher
0,references,crossref,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,https://doi.org/10.29110/soylemdergi.1169829,journal-article,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC),Katılımcı İnternet Sözlüklerinde Tahsin Yücel:...,"[[Cansu, AVCI], [Erdoğan, KARTAL]]",SOYLEM Filoloji Dergisi
1,references,crossref,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,https://doi.org/10.1186/s12888-022-04460-7,journal-article,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC),Work-related stress and associated factors amo...,"[[Tsegaye Adane, Birhan], [Muche, Ambissa], [T...",Springer Science and Business Media LLC
2,references,crossref,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,https://doi.org/10.34234/ded.1166387,journal-article,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC),Sınıf Öğretmeni Adaylarına Göre Geleneksel Çoc...,"[[Mustafa, EROL], [Mehmet Umut, AKBAKLA], [Nur...",Degerler Egitimi Dergisi
3,references,crossref,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,https://doi.org/10.46652/rgn.v6i27.766,journal-article,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC),Gamificación: Reflexiones teóricas desde el en...,"[[Dayana Sofia, Barros-Pozo], [Ricardo Patrici...",Centro de Investigaciones en Ciencias Sociales...
4,references,crossref,https://doi.org/10.5285/507A5E1F-E056-454C-8FF...,https://doi.org/10.1007/978-3-030-78280-1_11,book-chapter,"Water chemistry, hydrology and fluvial carbon ...","[Vihermaa, L. E., Waldron, S.]",2015-06-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",22,"[{'id': '10.18778/0208-600x.78.05', 'type': 'd...",Environmental Information Data Centre (EIDC),Humour and Politics: A Discursive Approach to ...,"[[Maria Aldina, Marques]]",Springer International Publishing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,references,crossref,https://doi.org/10.5285/d7da6cb9-104b-4dbc-b70...,https://doi.org/10.1016/j.agee.2021.107755,journal-article,"Woody linear features framework, Great Britain...","[Scholefield, P.A., Morton, R.D., Rowland, C.S...",2016-11-08,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",3,"[{'id': '10.1111/1365-2664.13412', 'type': 'do...",Environmental Information Data Centre (EIDC),Does agri-environment scheme participation in ...,"[[Mike, Image], [Emma, Gardner], [Yann, Clough...",Elsevier BV
448,is-part-of,crossref,https://doi.org/10.5285/f2856ee8-da6e-4b67-bed...,https://doi.org/10.5194/hess-22-611-2018,journal-article,Gridded estimates of daily and monthly areal r...,"[Tanguy, M., Dixon, H., Prosdocimi, I., Morris...",2015-09-25,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",5,"[{'id': '10.5194/hess-22-611-2018', 'type': 'd...",Environmental Information Data Centre (EIDC),"A large set of potential past, present and fut...","[[Benoit P., Guillod], [Richard G., Jones], [S...",Copernicus GmbH
465,references,crossref,https://doi.org/10.5285/f3723162-4fed-4d9d-92c...,https://doi.org/10.1007/s00799-016-0175-5,Info not given,Future flows hydrology data,"[Haxton, T., Crooks, S., Jackson, C.R., Barkwi...",2012-04-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",6,"[{'id': '10.5194/hess-22-5387-2018', 'type': '...",Environmental Information Data Centre (EIDC),Implementation of a workflow for publishing ci...,"[[Kathryn A., Harrison], [Daniel G., Wright], ...",Springer Science and Business Media LLC
466,is-part-of,crossref,https://doi.org/10.5285/f3723162-4fed-4d9d-92c...,https://doi.org/10.5194/hess-22-5387-2018,journal-article,Future flows hydrology data,"[Haxton, T., Crooks, S., Jackson, C.R., Barkwi...",2012-04-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",6,"[{'id': '10.5194/hess-22-5387-2018', 'type': '...",Environmental Information Data Centre (EIDC),Future hot-spots for hydro-hazards in Great Br...,"[[Lila, Collet], [Shaun, Harrigan], [Christel,...",Copernicus GmbH


### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# TAKES A LONG TIME - can skip this if you don't want this info
citationStrList = [] # one citation string per row, in row order
citation_by_doi = {} # cache so that each distinct publication DOI is only requested once

for pubDOI in crossRef_df_gbif_filtered2_deduplicated['subj_id']:

    if pubDOI not in citation_by_doi:
        # The citation style is a parameter of the Accept media type, not a separate HTTP header
        r = requests.get(pubDOI, headers={"Accept": "text/x-bibliography; style=apa"}, timeout=30)
        #print(r.status_code)

        # requests falls back to ISO-8859-1 for text responses that declare no charset
        # NOTE(review): assumes the formatted citation is UTF-8 - confirm with a citation containing accented names
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = "utf-8"

        citation_by_doi[pubDOI] = r.text

    citationStrList.append(citation_by_doi[pubDOI]) # add the citation string to the list

crossRef_df_gbif_filtered2_deduplicated['PubCitationStr'] = citationStrList # add the citation strings as a new column

In [None]:
# Show the citation table again (includes PubCitationStr if the cell above was run)
crossRef_df_gbif_filtered2_deduplicated

In [None]:
# add section comparing counts from Crossref, DataCite and Scholix
# first build a tool that takes one dataset and compares citation results from each source


In [None]:
# Write the two result tables to csv (date-range workflow)

from datetime import date
today = date.today()

# retrieval date as DDMMYYYY, shared by both file names
retrieved = today.strftime("%d%m%Y")

# one row per dataset/publication citation
crossRef_df_processed_filename = f"dataset_citation_publication_info_{start_date}_to_{end_date}_retrieved_{retrieved}.csv"
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index = False)
print(crossRef_df_processed_filename)

# one row per dataset, with its citation counts
dataset_filename = f"dataset_citation_counts_{start_date}_to_{end_date}_retrieved_{retrieved}.csv"
dataset_df.to_csv(dataset_filename, index = False)
print(dataset_filename)

In [8]:
# Write the two result tables to csv when working from a list of DOIs rather than dates

from datetime import date
today = date.today()
doi_list_name = "marika_doi_list_scholix"

# retrieval date as DDMMYYYY, shared by both file names
retrieved = today.strftime("%d%m%Y")

# one row per dataset/publication citation
crossRef_df_processed_filename = f"dataset_citation_publication_info_{doi_list_name}_retrieved_{retrieved}.csv"
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index = False)
print(crossRef_df_processed_filename)

# one row per dataset, with its citation counts
dataset_filename = f"dataset_citation_counts_{doi_list_name}_retrieved_{retrieved}.csv"
dataset_df.to_csv(dataset_filename, index = False)
print(dataset_filename)

dataset_citation_publication_info_marika_doi_list_scholix_retrieved_08032023.csv
dataset_citation_counts_marika_doi_list_scholix_retrieved_08032023.csv
