### Workflow to collect citation information for datasets published by NERC data centres

In [16]:
import sys
sys.path.insert(0, '..')

import mrced2 # module to run event data queries
from getDataCiteInfo import getDataCiteInfo
import os # some file manipulations
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

# --- Query configuration: contact email, DOI prefix and the occurred-date window ---
email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "2000-01-01"
end_date = "2023-03-01"

# check both dates are valid: fromisoformat raises ValueError for a string
# that is not an ISO formatted date
for iso_date in (start_date, end_date):
    datetime.date.fromisoformat(iso_date)

# filename to save json event data to
filename = f"event_data_{prefix}_{start_date}_{end_date}.json"

# Set up the query (kept as the last expression so the cell still shows its result)
ed = mrced2.eventData(email=email, outputFile=filename)
ed.buildQuery({
    'obj-id.prefix': prefix,
    'from-occurred-date': start_date,
    'until-occurred-date': end_date,
})

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2000-01-01&until-occurred-date=2023-03-01


In [17]:
# run the query to determine number of events
ed.runQuery(retry = 5) # scholix = False - can query scholix api as well - worth exploring

# number of events requested per page; the same value is used for the page-count
# calculation and for the 'rows' query parameter so the two cannot drift apart
rows_per_page = 1000

# calculate how many pages will need to be iterated over
num_pages = ceil(ed.events.count() / rows_per_page)

# set up folder to write the result jsons into
results_folder = "NERC_EDS_events_from_" + start_date + "_up_to_" + end_date
# exist_ok=True lets the notebook be re-run for the same date range; os.mkdir raised
# FileExistsError when the folder was already there.
# NOTE(review): files left in the folder by an earlier run are not removed here -
# presumably getAllPages overwrites them by name (page0000.json, ...) - confirm
os.makedirs(results_folder, exist_ok=True)

# find info from all the pages
ed.getAllPages(
    num_pages,
    {'rows': rows_per_page, 'obj-id.prefix' : prefix, 'from-occurred-date' : start_date, 'until-occurred-date' : end_date},
    fileprefix = (results_folder + '/page'),
)


Event Data query started...
API query complete  200
output file written to event_data_10.5285_2000-01-01_2023-03-01.json
43667 events found
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=2162cf3a-79be-4bd3-aaa7-1bdb682872b5&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2000-01-01&until-occurred-date=2023-03-01
Event Data query started...
API query complete  200
output file written to NERC_EDS_events_from_2000-01-01_up_to_2023-03-01/page0000.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=ec36cf84-461c-40bb-8342-ad101145241a&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2000-01-01&until-occurred-date=2023-03-01
Event Data query started...
API query complete  200
output file written to NERC_EDS_events_from_2000-01-01_up_to_2023-03-01/page0001.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=f5ad304b-38d0-4923-8af4-7088a3d77b90&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2000-01-01&until-occurred-dat

In [18]:
# instance of a class to interpret the events
jd1 = mrced2.eventRecord()

# get all the filenames - sorted, because os.listdir returns them in arbitrary order and
# the de-duplication below keeps the first occurrence, so load order affects the result
files = sorted(os.listdir(results_folder))

# load the json event data from multiple files
jd1.mergeJsons(files, folder = results_folder)

## filter out twitter wikipedia etc - later add options to include these info
filters = {"source_id" : ['twitter', 'wikipedia', 'newsfeed', 'wordpressdotcom', 'reddit-links']}
filtered_info = jd1.filter(filters, mode = 'NOT')

# collect relevant citation info for NERC project
citationInfo = filtered_info.collectCitationInfo()

# convert to dataframe
crossRef_df = pd.DataFrame(citationInfo)

# filter out gbif registrant code prefix 10.15468
# regex=False: match the prefix literally; as a regular expression the '.' would match any character
is_gbif = crossRef_df.subj_id.str.contains("10.15468", regex=False)
crossRef_df_gbif_filtered = crossRef_df[~is_gbif]

# filter out relation_type_id values that we don't want
# (the '|' alternation is intentional here, so this one stays a regular expression)
unwanted_relation_types = "is_referenced_by|discusses|is_new_version_of|is_supplemented_by|is_previous_version_of"
is_unwanted_relation = crossRef_df_gbif_filtered.relation_type_id.str.contains(unwanted_relation_types)
crossRef_df_gbif_filtered2 = crossRef_df_gbif_filtered[~is_unwanted_relation]

# remove duplicate subj_ids for each obj_id - e.g. 10.5285/a7f28dea-64f7-43b5-bc39-a6cfcdeefbda has multiple references from 10.5285/65140444-b5fa-4a5e-9ab4-e86c106051e2
# rows count as duplicates when obj_id and subj_id are both the same - should I match any other columns?
# the first occurrence of each pair is kept; .copy() gives an independent frame so that
# later cells can add columns to it without pandas raising SettingWithCopyWarning
crossRef_df_gbif_filtered2_deduplicated = crossRef_df_gbif_filtered2.drop_duplicates(subset=['obj_id', 'subj_id']).copy()



In [19]:
# Send the de-duplicated citation table to getDataCiteInfo, which returns a pair:
# a frame of errors and a frame of DataCite info on the datasets, data centres etc
datacite_results = getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)
errors_df, dataCite_df = datacite_results

https://doi.org/10.5285/6c6c9203-7333-4d96-88ab-78925e7a4e73 https://doi.org/10.1007/s11368-018-1990-7
API response:  200
https://doi.org/10.5285/4c7fdfa6-f176-4c58-acee-683d5e9d2ed5 https://doi.org/10.5194/gmd-11-1377-2018
API response:  200
https://doi.org/10.5285/18BE23F8-D252-482D-8AF9-5D6A2D40990C https://doi.org/10.1007/s00704-017-2246-y
API response:  200
https://doi.org/10.5285/e1d33b37-f1d4-4234-a0d5-8bf4e657f653 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/4c9613ce-de52-41b1-9fde-7c41f9199686 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/58a8802721c94c66ae45c3baa4d814d0 https://doi.org/10.1007/s00704-018-2476-7
API response:  200
https://doi.org/10.5285/475520d5-bad9-4d84-e053-6c86abc0b01b https://doi.org/10.1007/s10533-017-0350-9
API response:  200
https://doi.org/10.5285/bad1514f-119e-44a4-8e1e-442735bb9797 https://doi.org/10.1007/s11269-018-1914-8
API response:  200
https://doi.org/10.5285/33604

Pass the citation info to the DataCite API to collect relevant information on the datasets, data centres, etc.

In [None]:
# Inspect what getDataCiteInfo returned: the errors frame first, then the dataset info.
# The errors frame is bound to `errors_df` (there is no `errors` variable), and display()
# is needed because only the last expression of a cell is rendered automatically.
display(errors_df)
dataCite_df

In [None]:
# TODO: what to do with publication type information? if posted content probably want to ignore?
# For now just show the dataset DOI (obj_id) column of the de-duplicated citation table
crossRef_df_gbif_filtered2_deduplicated['obj_id']

# TODO: for each dataset count each type of publication that cites it
# for data_doi in crossRef_df_processed


In [24]:
# merge crossRef_df and dataCite_df to get dataset info in crossref_df
# index the DataCite table by dataset DOI once (not on every loop iteration); every
# remaining column is then a dataset attribute, so 'dataset_DOI' is excluded by name
# rather than by assuming it is the first column
dataset_info_by_doi = dataCite_df.set_index('dataset_DOI')

for column_name in dataset_info_by_doi.columns:

    # create dictionary of data_doi, value pairs
    doi_to_value = dataset_info_by_doi[column_name].to_dict()

    # use the data doi (obj_id) to map the dictionary onto the citation table;
    # obj_ids with no DataCite entry get NaN
    crossRef_df_gbif_filtered2_deduplicated.loc[:, column_name] = crossRef_df_gbif_filtered2_deduplicated.obj_id.map(doi_to_value)


In [25]:
# Show the citation table, which now also carries the DataCite dataset columns mapped onto it by obj_id
crossRef_df_gbif_filtered2_deduplicated

Unnamed: 0,relation_type_id,source_id,obj_id,subj_id,subj_work_type_id,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed
0,references,crossref,https://doi.org/10.5285/6c6c9203-7333-4d96-88a...,https://doi.org/10.1007/s11368-018-1990-7,Info not given,"Land Cover Map 2015 (vector, GB)","[Rowland, C.S., Morton, R.D., Carrasco, L., Mc...",2017-04-12,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",61.0,"[{'id': '10.1007/s11368-018-1990-7', 'type': '...",Environmental Information Data Centre (EIDC)
1,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1377-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
2,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.5194/gmd-11-1343-2018,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
3,references,crossref,https://doi.org/10.5285/18BE23F8-D252-482D-8AF...,https://doi.org/10.1007/s00704-017-2246-y,Info not given,CRU TS3.22: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2014,Issued,[],5.0,"[{'id': '10.3390/atmos13030421', 'type': 'dois...",Centre for Environmental Data Analysis (CEDA)
4,references,crossref,https://doi.org/10.5285/4c7fdfa6-f176-4c58-ace...,https://doi.org/10.1002/2017WR021682,Info not given,CRU TS3.23: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2015,Issued,[],2.0,"[{'id': '10.1007/978-3-030-92782-0_6', 'type':...",Centre for Environmental Data Analysis (CEDA)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,references,crossref,https://doi.org/10.5285/edf8febfdaad48abb2cbaf...,https://doi.org/10.1007/978-981-19-4476-5_5,book-chapter,CRU TS4.00: Climatic Research Unit (CRU) Time-...,[University Of East Anglia Climatic Research U...,2017-04-07,Updated,"[{'relationType': 'IsPartOf', 'relatedIdentifi...",3.0,"[{'id': '10.5194/hess-26-711-2022', 'type': 'd...",Centre for Environmental Data Analysis (CEDA)
6996,references,crossref,https://doi.org/10.5285/B79E887E-A2A7-4224-8FD...,https://doi.org/10.23887/jiku.v10i1.43287,journal-article,"Land Cover Map 2000 (vector, GB)","[Fuller, R.M., Smith, G.M., Sanderson, J.M., H...",2002-01-01,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",10.0,"[{'id': '10.1007/s00125-020-05087-7', 'type': ...",Environmental Information Data Centre (EIDC)
6999,references,datacite,https://doi.org/10.5285/97204415-683f-4d55-8b3...,https://doi.org/10.5285/4ffad557-1c3c-4ea7-a73...,Dataset,"Thwaites MELT: Conductivity, Temperature and D...","[Davis, Peter, Nicholls, Keith, Holland, David]",2020-01-09/2020-01-12,Collected,"[{'relationType': 'References', 'relatedIdenti...",1.0,[{'id': '10.5285/4ffad557-1c3c-4ea7-a73d-6d782...,Polar Data Centre (PDC)
7001,references,datacite,https://doi.org/10.5285/4ffad557-1c3c-4ea7-a73...,https://doi.org/10.5285/97204415-683f-4d55-8b3...,Dataset,"Thwaites MELT: Temperature, salinity and veloc...","[Davis, Peter, Nicholls, Keith, Holland, David]",2020-01-23/2021-02-01,Collected,"[{'relationType': 'References', 'relatedIdenti...",1.0,[{'id': '10.5285/97204415-683f-4d55-8b38-a2700...,Polar Data Centre (PDC)


In [26]:
# create data frame that just lists each dataset and has citations counts from crossref, scholex, datacite etc
# keep only the first row seen for every obj_id, then drop the columns that describe
# the citing publication rather than the dataset
is_repeat_dataset = crossRef_df_gbif_filtered2_deduplicated.duplicated('obj_id')
citation_only_columns = ['relation_type_id', 'source_id', 'subj_id', 'subj_work_type_id']
dataset_df = (
    crossRef_df_gbif_filtered2_deduplicated
    .loc[~is_repeat_dataset]
    .drop(citation_only_columns, axis = 1)
)

# count how many times each dataset DOI appears in the citation table
# TODO: need counts that include or exclude different relation_type_ids and subj_work_type_id
crossRef_citation_counts = crossRef_df_gbif_filtered2_deduplicated['obj_id'].value_counts()

# look each dataset DOI up in the {data_doi: count} dictionary and store the count on dataset_df
dataset_df['crossRef_citation_count'] = dataset_df['obj_id'].map(crossRef_citation_counts.to_dict())



In [None]:
# # pass publication DOIs to DOI.org to determine type of publication using checkDOIpubType function defined above
# pubTypeList = []
# pubDOI = []

# for count, doi in enumerate(crossRef_df_processed['subj_id']):
#     doiComponents = doi.split('/')[-2:]
#     doi = doiComponents[0] + "/" + doiComponents[1]
#     # print(type(doi))
#     pubType = checkDOIpubType(doi)
#     pubTypeList.append(pubType)
#     pubDOI.append(doi)
    
#     # add code to catch retries limit exceeded - might need to be in function itself?
#     # e.g. https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
    
#     time.sleep(0.5)
#     # if count % 200 == 0: # if count is a multiple of 200 wait for a bit
#     #         time.sleep(180)

# print('Done!')

# crossRef_df_processed['publicationType'] = pubTypeList
# crossRef_df_processed['publication_DOI'] = pubDOI


# # split publicationType column into publicationType1 and publicationSubType columns
# crossRef_df_processed[['publicationType1','publicationSubType']] = pd.DataFrame(crossRef_df_processed['publicationType'].tolist(), index=crossRef_df_processed.index)


# # determine publication type for records where DOI.org API call failed - ~10 mins
# newPubTypeList = []

# # get the unknown rows from scholex_df pubtype column and the pubDOI
# for pubType, pubDOI in zip(crossRef_df_processed['publicationType1'],crossRef_df_processed['publication_DOI']):
    
#     if pubType == 'unknown':

#         # need a catcher to make sure pubDOI is a single ID - in the cases where there was no DOI there is more than one ID recorded
#         if len(pubDOI) < 100: # if the length is less than 100 (arbitrarily) then it will be a single DOI
#             pass
#         else: # if there is no DOI skip this record
#             newPubTypeList.append(pubType) # leave it the same
#             #newPubType = pubType 
#             continue
        
#         # determine if crossref or datacite supplies the DOI
#         print('Pub DOI: ', pubDOI)
#         r = requests.get(('https://doi.org/doiRA/' + pubDOI), headers={"Accept": "application/json"})
#         DOIregistry = r.json()[0]['RA']
#         print(DOIregistry)

#         # query the crossref or datacite API
#         if DOIregistry == 'DataCite':
#             # ask the Datacite API what type of publication
#             r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

#         elif DOIregistry == 'Crossref':
#             r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['message']['type']) # could also add 'subtype':r.json()['message']['subtype']
#         else:
#             print('Unknown DOI registry')
        
#     else: 
#         newPubTypeList.append(pubType) # in the cases where the pubType is not unknown keep it the same

# crossRef_df_processed['newPubTypeList'] = newPubTypeList
# print("Done")

In [None]:
# Ask doi.org which registration agency (RA) issued an example publication DOI
pubDOI = '10.1007/s11368-018-1990-7'
ra_lookup_url = 'https://doi.org/doiRA/' + pubDOI
r = requests.get(ra_lookup_url, headers={"Accept": "application/json"})

# the 'RA' field is read from the first element of the JSON response
ra_records = r.json()
DOIregistry = ra_records[0]['RA']
print(DOIregistry)

In [None]:
# query the DataCite API when DataCite is the registration agency of pubDOI
if DOIregistry == 'DataCite':
    # ask the Datacite API what type of publication
    r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
    print(r.status_code)
    # The result is printed rather than appended to newPubTypeList: that list is only
    # created inside the commented-out cell, so appending to it raised NameError here.
    pubType = r.json()['data']['attributes']['types']['citeproc'] # is citeproc the correct one to use?
    print(pubType)

In [38]:
# Fetch the Crossref works record for pubDOI and show its 'message' payload
crossref_works_url = 'https://api.crossref.org/works/' + pubDOI
json_headers = {"Accept": "application/json"}
r = requests.get(crossref_works_url, headers=json_headers)
print(r.status_code)
r.json()['message']

200


{'indexed': {'date-parts': [[2023, 3, 1]],
  'date-time': '2023-03-01T10:54:00Z',
  'timestamp': 1677668040578},
 'reference-count': 47,
 'publisher': 'Springer Science and Business Media LLC',
 'issue': '9',
 'license': [{'start': {'date-parts': [[2018, 4, 13]],
    'date-time': '2018-04-13T00:00:00Z',
    'timestamp': 1523577600000},
   'content-version': 'tdm',
   'delay-in-days': 0,
   'URL': 'http://www.springer.com/tdm'}],
 'funder': [{'DOI': '10.13039/100010665',
   'name': 'H2020 Marie Skłodowska-Curie Actions',
   'doi-asserted-by': 'publisher',
   'award': ['644320']},
  {'name': 'Malaysian Government Ministry of Higher Education'}],
 'content-domain': {'domain': ['link.springer.com'],
  'crossmark-restriction': False},
 'short-container-title': ['J Soils Sediments'],
 'published-print': {'date-parts': [[2018, 9]]},
 'DOI': '10.1007/s11368-018-1990-7',
 'type': 'journal-article',
 'created': {'date-parts': [[2018, 4, 13]],
  'date-time': '2018-04-13T01:25:40Z',
  'timestamp':

In [None]:

# add a section to get publication info from DOI.org api to add to crossRef_df_gbif_filtered2_deduplicated? e.g. title, authors etc
# single-DOI trial of that lookup: request JSON for one publication DOI URL
doi_url = 'https://doi.org/10.5194/gmd-11-1377-2018'
print(doi_url)
json_headers = {"Accept": "application/json"}
r = requests.get(doi_url, headers=json_headers) # sometimes throws up unexpected errors

# sometimes the API call returns info in a format that is not JSON
r.json()

In [27]:
# very slow if crossRef_df_gbif_filtered2_deduplicated is large - about 1 hour for everything from year 2000
# CHANGE THIS TO CROSSREF API BECAUSE DOI.ORG API IS bad
# section to get publication info from DOI.org api to add to crossRef_df_gbif_filtered2_deduplicated? e.g. title, authors etc

def _get_publication_info(pubdoi, timeout=30):
    """Look up title, authors and publisher for one publication DOI URL.

    Parameters
    ----------
    pubdoi : str
        URL of the publication DOI (a value from the subj_id column).
    timeout : float
        Seconds to wait for the server before giving up on this DOI.

    Returns
    -------
    dict
        Keys 'pub_doi', 'pub_Title', 'pub_authors' and 'publisher'. Any field that
        cannot be retrieved holds the placeholder "Info not given". 'pub_authors' is a
        list of [given, family] pairs, or the placeholder if the author list is missing
        or any author lacks a given/family name.
    """
    not_given = "Info not given"
    info = {
        'pub_doi': pubdoi,
        'pub_Title': not_given,
        'pub_authors': not_given,
        'publisher': not_given,
    }

    try:
        # NOTE(review): the DOI content negotiation docs list
        # application/vnd.citationstyles.csl+json as the CSL JSON media type -
        # consider switching the Accept header; left unchanged here
        r = requests.get(pubdoi, headers={"Accept": "application/json"}, timeout=timeout)
        metadata = r.json()  # parse the body once instead of once per field
    except (requests.exceptions.RequestException, ValueError) as err:
        # network failure, timeout, or a body that is not JSON: keep the placeholders
        # so one bad DOI does not abort the whole (long) run
        print("No metadata retrieved for", pubdoi, "-", type(err).__name__)
        return info

    if not isinstance(metadata, dict):
        return info

    info['pub_Title'] = metadata.get('title', not_given)
    info['publisher'] = metadata.get('publisher', not_given)

    try:
        info['pub_authors'] = [[author['given'], author['family']] for author in metadata['author']]
    except (KeyError, TypeError):
        pass  # keep the placeholder

    return info


pub_info = []
# each distinct publication is requested once, even if it cites several datasets
for pubdoi in crossRef_df_gbif_filtered2_deduplicated['subj_id'].dropna().unique():
    pub_info.append(_get_publication_info(pubdoi))
    print(pubdoi)

# add new publication info columns to dataframe
# (columns are given explicitly so the frame has them even when there are no rows)
pubInfo_df = pd.DataFrame(pub_info, columns=['pub_doi', 'pub_Title', 'pub_authors', 'publisher'])
pubInfo_by_doi = pubInfo_df.set_index('pub_doi')

# loop through new columns to be added to df
for ii in pubInfo_by_doi.columns:
    # create dictionary of doi, value pairs
    d = pubInfo_by_doi[ii].to_dict()

    # use the doi to map the dictionary to crossRef_df_gbif_filtered2_deduplicated
    crossRef_df_gbif_filtered2_deduplicated.loc[:,ii] = crossRef_df_gbif_filtered2_deduplicated.subj_id.map(d)

print("Done!")

Done!


In [None]:
# add a step to remove results not from an approved list of publishers/journals?

In [15]:
# Show the citation table after the publication title, authors and publisher columns have been mapped onto it
crossRef_df_gbif_filtered2_deduplicated

Unnamed: 0,relation_type_id,source_id,obj_id,subj_id,subj_work_type_id,dataset_Title,dataset_authors,dataset_date,dataset_date_Type,related_identifiers_list,DataCite_Citation_count,DataCite_Citations_list,dataset_publisher_processed,pub_Title,pub_authors,publisher
0,references,crossref,https://doi.org/10.5285/58a8802721c94c66ae45c3...,https://doi.org/10.1007/s10021-019-00339-z,Info not given,CRU TS4.01: Climatic Research Unit (CRU) Time-...,[University Of East Anglia Climatic Research U...,2017-09-22,Updated,"[{'relationType': 'IsPartOf', 'relatedIdentifi...",11,"[{'id': '10.1007/s00382-021-05992-6', 'type': ...",Centre for Environmental Data Analysis (CEDA),Geographically Structured Growth decline of Re...,"[[Xavier, Serra-Maluquer], [Antonio, Gazol], [...",Springer Science and Business Media LLC
1,references,crossref,https://doi.org/10.5285/07896bb2-7078-468c-b56...,https://doi.org/10.1111/jbi.13501,Info not given,Countryside Survey 1998 vegetation plot data,"[Barr, C.J., Bunce, R.G.H., Gillespie, M.K., H...",2014-03-03,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",4,"[{'id': '10.1016/10.1111/jbi.13501', 'type': '...",Environmental Information Data Centre (EIDC),Oribatid mites show how climate and latitudina...,"[[Tancredi, Caruso], [Ina, Schaefer], [Frank, ...",Wiley
3,references,crossref,https://doi.org/10.5285/8c03f651457f458eaf7b16...,https://doi.org/10.1038/sdata.2018.57,Info not given,Ensemble of Global and EU Regional climate sim...,"[Sparrow, Sarah, Schaller, Nathalie, Massey, N...",2017-06-09,Updated,[],0,[],Centre for Environmental Data Analysis (CEDA),Ensemble of European regional climate simulati...,"[[Nathalie, Schaller], [Sarah N., Sparrow], [N...",Springer Science and Business Media LLC
4,references,crossref,https://doi.org/10.5285/3aa44060-d4fd-453f-9e5...,https://doi.org/10.1038/sdata.2018.265,Info not given,High-resolution time-resolved synchrotron X-ra...,"[Kamaljit Singh, Imperial College London, Mart...",2018-08-23,Available,[],0,[],National Geoscience Data Centre (NGDC),Time-resolved synchrotron X-ray micro-tomograp...,"[[Kamaljit, Singh], [Hannah, Menke], [Matthew,...",Springer Science and Business Media LLC
5,references,crossref,https://doi.org/10.5285/D0E1585D-3417-485F-87A...,https://doi.org/10.1038/sdata.2018.299,Info not given,CRU TS3.21: Climatic Research Unit (CRU) Time-...,"[Harris, Ian]",2013,Issued,[],4,"[{'id': '10.1007/s00382-014-2341-z', 'type': '...",Centre for Environmental Data Analysis (CEDA),"A long-term, temporally consistent, gridded da...","[[A. T., Werner], [M. A., Schnorbus], [R. R., ...",Springer Science and Business Media LLC
6,references,crossref,https://doi.org/10.5285/edf8febfdaad48abb2cbaf...,https://doi.org/10.1038/sdata.2017.191,Info not given,CRU TS4.00: Climatic Research Unit (CRU) Time-...,[University Of East Anglia Climatic Research U...,2017-04-07,Updated,"[{'relationType': 'IsPartOf', 'relatedIdentifi...",3,"[{'id': '10.5194/hess-26-711-2022', 'type': 'd...",Centre for Environmental Data Analysis (CEDA),"TerraClimate, a high-resolution global dataset...","[[John T., Abatzoglou], [Solomon Z., Dobrowski...",Springer Science and Business Media LLC
7,references,crossref,https://doi.org/10.5285/70ee24a5-d19c-4ca8-a1c...,https://doi.org/10.1038/sdata.2018.239,Info not given,Harmonia axyridis invasion: UK distribution da...,"[Roy, H.E., Brown, P.M.J., Harrower, C., Dean,...",2017-05-04,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",1,"[{'id': '10.1038/sdata.2018.239', 'type': 'doi...",Environmental Information Data Centre (EIDC),"Spread of a model invasive alien species, the ...","[[P. M. J., Brown], [D. B., Roy], [C., Harrowe...",Springer Science and Business Media LLC
8,references,crossref,https://doi.org/10.5285/18939865-d863-498b-b4c...,https://doi.org/10.1038/sdata.2018.208,Info not given,Methane transport through an agricultural soil...,"[Shaw, G., Atkinson, B. S., Meredith, W., Snap...",2018-02-09,Submitted,"[{'relationType': 'IsDescribedBy', 'relatedIde...",2,"[{'id': '10.1016/10.1038/sdata.2018.208', 'typ...",Environmental Information Data Centre (EIDC),Methane transport in agricultural soil after i...,"[[George, Shaw], [Brian, Atkinson], [William, ...",Springer Science and Business Media LLC


### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# TAKES A LONG TIME - can skip this if you don't want this info
citation_by_doi = {} # citation string for each distinct publication DOI URL

# each distinct publication is requested once, even if it cites several datasets
for pubDOI in crossRef_df_gbif_filtered2_deduplicated['subj_id'].dropna().unique():

    citation = "Info not given" # placeholder kept when no citation can be retrieved

    try:
        # the citation style is a parameter of the Accept media type, not a separate header
        r = requests.get(pubDOI, headers={"Accept": "text/x-bibliography; style=apa"}, timeout=30)
    except requests.exceptions.RequestException as err:
        print("No citation retrieved for", pubDOI, "-", type(err).__name__)
    else:
        if r.ok: # skip error responses so their body is not stored as a citation
            # requests falls back to ISO-8859-1 for text responses that declare no charset;
            # NOTE(review): assumes the citation text is UTF-8 in that case - confirm
            if 'charset' not in r.headers.get('Content-Type', '').lower():
                r.encoding = 'utf-8'
            citation = r.text

    citation_by_doi[pubDOI] = citation

# add the citation strings to the citation table, matched on the publication DOI
crossRef_df_gbif_filtered2_deduplicated['PubCitationStr'] = crossRef_df_gbif_filtered2_deduplicated['subj_id'].map(citation_by_doi)

In [None]:
# Show the citation table including the PubCitationStr column
crossRef_df_gbif_filtered2_deduplicated

In [None]:
# add section comparing counts from crossref, DataCite and scholex
# first build a tool that takes one dataset and compares citation results from each source


In [30]:
# Output csv files

# retrieval date (DDMMYYYY) - uses the datetime module imported in the first cell
# instead of a second import in the middle of the notebook
today = datetime.date.today()

# shared tail of both filenames: date range of the query plus the retrieval date
filename_suffix = start_date + "_to_" + end_date + "_retrieved_" + today.strftime("%d%m%Y") + '.csv'

# one row per (dataset, citing publication) pair
crossRef_df_processed_filename = 'dataset_citation_publication_info_' + filename_suffix
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index = False)
print(crossRef_df_processed_filename)

# one row per dataset, with its citation counts
dataset_filename = 'dataset_citation_counts_' + filename_suffix
dataset_df.to_csv(dataset_filename, index = False)
print(dataset_filename)

dataset_citation_publication_info_2000-01-01_to_2023-03-01_retrieved_03032023.csv
dataset_citation_counts_2000-01-01_to_2023-03-01_retrieved_03032023.csv
