### Workflow to collect citation information for datasets published by NERC data centres

In [None]:
import sys
sys.path.insert(0, '..')

# module to run event data queries
import crossRef_fun
import os # some file manipulations
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

In [1]:
# Fetch Crossref Event Data for one DOI prefix and date window through the
# packaged helper function instead of running the individual steps by hand.
import sys
sys.path.insert(0, '..')  # put the parent directory first on the import path
from crossRef_fun import getCrossRefCitations

# query parameters
email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "2023-01-01"
end_date = "2023-02-01"

# output locations
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"
# NOTE(review): results_folder is not passed to the call below; only
# results_folder_path is handed to the helper.
results_folder = f"{results_folder_path}NERC_EDS_events_from_{start_date}_up_to_{end_date}"

getCrossRefCitations.getCrossRefCitations_byDates(
    email,
    prefix,
    start_date,
    end_date,
    results_folder_path,
)

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2023-01-01&until-occurred-date=2023-02-01
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/event_data_10.5285_2023-01-01_2023-02-01.json
543 events found
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=973ad393-9c04-4dfb-aea3-b525ebd0a92d&rows=1000&obj-id.prefix=10.5285&from-occurred-date=2023-01-01&until-occurred-date=2023-02-01
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/NERC_EDS_events_from_2023-01-01_up_to_2023-02-01/page0000.json


In [None]:
# Query Crossref Event Data for every event whose object DOI carries `prefix`
# and that occurred between start_date and end_date, saving the raw JSON under
# results_folder_path.
email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "2023-01-01"
end_date = "2023-03-01"
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"

# Check both dates before any request is sent. A malformed date stops the cell
# here instead of only printing a message and querying anyway.
try:
    datetime.date.fromisoformat(start_date)
    datetime.date.fromisoformat(end_date)
except ValueError as err:
    raise ValueError("start_date and/or end_date in wrong format. Should be yyyy-mm-dd") from err

# filename to save json event data to
filename = results_folder_path + "crossRef_event_data_" + prefix + "_" + start_date + "_" + end_date + ".json"

# filters shared by the counting query and the paged download
date_filters = {'obj-id.prefix': prefix, 'from-occurred-date': start_date, 'until-occurred-date': end_date}
rows_per_page = 1000

# Set up the query (a copy of the filters is passed so the shared dict stays untouched)
ed = crossRef_fun.eventData(email=email, outputFile=filename)
ed.buildQuery(dict(date_filters))

# run the query to determine number of events
ed.runQuery(retry=5)  # scholix = False - can query scholix api as well - worth exploring

# calculate how many pages will need to be iterated over
num_pages = ceil(ed.events.count() / rows_per_page)

# set up folder to put result jsons into; exist_ok lets the cell be re-run for
# the same date range without failing on the existing folder
results_folder = results_folder_path + "crossRef_NERC_EDS_events_from_" + start_date + "_up_to_" + end_date
os.makedirs(results_folder, exist_ok=True)
# NOTE(review): files already in the folder are left in place. Page files from
# an earlier run that produced more pages would remain and be listed by the
# later os.listdir(results_folder) step - confirm whether the folder should be
# emptied first.

# find info from all the pages
ed.getAllPages(num_pages, {'rows': rows_per_page, **date_filters}, fileprefix=(results_folder + '/page'))

In [None]:
# Code for getting results based on a list of DOIs rather than a date range.
# For every DOI in the CSV, run an Event Data query and save the JSON results
# into one shared results folder.
import sys
sys.path.insert(0, '..')

import mrced2 # module to run event data queries
from getDataCiteInfo import getDataCiteInfo
import os # some file manipulations
from math import ceil
import json
import pandas as pd
import time
import re
import requests
import datetime

email = "matnic@ceh.ac.uk"

# import list of DOIs to find event data for (no header row, so the DOIs are in column 0)
doi_list = pd.read_csv("marika_doi_list.csv", header = None)

# set up folder to put result jsons into
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"
results_folder_name = "NERC_EDS_events_from_doi_list_scholix"
# join the base path with the folder *name*; the previous version added
# results_folder to itself, which is undefined on a fresh kernel and otherwise
# reuses whatever an earlier cell left in that variable
results_folder = results_folder_path + results_folder_name

os.makedirs(results_folder, exist_ok=True)

# NOTE(review): this cell uses mrced2.eventData while the date-range cell uses
# crossRef_fun.eventData - confirm both modules expose the same interface.
for count, doi in enumerate(doi_list[0]):
    print(doi)

    # "/" cannot appear in a file name, so swap it for "_"
    fileDoi = doi.replace("/", "_")

    # Set up the query
    # filename to save json event data to
    filename = f"{results_folder}/event_data_{fileDoi}_{count}.json"
    ed = mrced2.eventData(email=email, outputFile=filename)
    ed.buildQuery({'obj-id': doi})

    # run the query to determine number of events
    ed.runQuery(retry=5) # scholix = False - can query scholix api as well - worth exploring

    # calculate how many pages will need to be iterated over
    num_pages = ceil(ed.events.count()/1000)

    # find info from all the pages
    ed.getAllPages(num_pages, {'rows': 1000, 'obj-id': doi}, fileprefix=f"{results_folder}/{fileDoi}_page{count}") # fileDoi to give each result json a unique name, otherwise it writes over previous results

print('Done!')


In [None]:
# Load the saved event JSON files, remove unwanted sources, registrants and
# relation types, and keep one row per (dataset DOI, citing work) pair.

# instance of a class to interpret the events
jd1 = crossRef_fun.eventRecord()

# get all the filenames
files = os.listdir(results_folder)

# load the json event data from multiple files
jd1.mergeJsons(files, folder=results_folder)

## filter out twitter wikipedia etc - later add options to include these info
filters = {"source_id": ['twitter', 'wikipedia', 'newsfeed', 'wordpressdotcom', 'reddit-links']}
filtered_info = jd1.filter(filters, mode='NOT')

# collect relevant citation info for NERC project
citationInfo = filtered_info.collectCitationInfo()

# convert to dataframe
crossRef_df = pd.DataFrame(citationInfo)

# filter out gbif registrant code prefix 10.15468
# regex=False matches the prefix literally; as a regex the "." would match any character
crossRef_df_gbif_filtered = crossRef_df[~crossRef_df.subj_id.str.contains("10.15468", regex=False)]

# filter out relationship_type_id values that we don't want - is_referenced_by, discusses
# (this pattern is deliberately a regex: "|" separates the alternatives)
crossRef_df_gbif_filtered2 = crossRef_df_gbif_filtered[~crossRef_df_gbif_filtered.relation_type_id.str.contains("is_referenced_by|discusses|is_new_version_of|is_supplemented_by|is_previous_version_of")]

# remove duplicate subj_ids for each obj_id - e.g. 10.5285/a7f28dea-64f7-43b5-bc39-a6cfcdeefbda has multiple references from 10.5285/65140444-b5fa-4a5e-9ab4-e86c106051e2
# find rows where obj_id and subj_id are the same - should I match any other columns?
dups = crossRef_df_gbif_filtered2.duplicated(subset=['obj_id', 'subj_id'])
# keep the first row of each pair; .copy() gives an independent frame because
# later cells add columns to it, which on a slice triggers SettingWithCopyWarning
crossRef_df_gbif_filtered2_deduplicated = crossRef_df_gbif_filtered2[~dups].copy()



In [None]:
# Hand the de-duplicated citation table to getDataCiteInfo, which returns a
# pair: an errors object and a dataframe of DataCite information.
errors, dataCite_df = getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

Pass citation info to datacite API to collect relevant info on the datasets, data centres etc

In [None]:
# Inspect the two outputs of the DataCite lookup.
# NOTE(review): a notebook cell normally renders only its last bare expression,
# so `errors` is probably not shown here - confirm, and print it if it should be.
errors
dataCite_df

In [None]:
# # What to do with publication type information? if posted content probably want to ignore?
# crossRef_df_gbif_filtered2_deduplicated['obj_id']

# # for each dataset count each type of publication that cites it
# for data_doi in crossRef_df_processed


In [None]:
# Copy the DataCite columns onto the citation table (looked up by dataset DOI),
# then build a one-row-per-dataset table carrying a Crossref citation count.

# Every DataCite column after the first is copied across.
# NOTE(review): assumes 'dataset_DOI' is the first column of dataCite_df - confirm.
datacite_by_doi = dataCite_df.set_index('dataset_DOI')
for column in dataCite_df.columns[1:]:
    # dataset DOI -> value of this column
    lookup = datacite_by_doi[column].to_dict()
    crossRef_df_gbif_filtered2_deduplicated.loc[:, column] = crossRef_df_gbif_filtered2_deduplicated.obj_id.map(lookup)

# One row per dataset: drop every repeat of an obj_id, then the columns that
# describe the individual citing work.
# (counts from Scholix, DataCite etc. are still to be added)
repeated_dataset = crossRef_df_gbif_filtered2_deduplicated.duplicated('obj_id')
rows_to_drop = repeated_dataset[repeated_dataset].index
dataset_df = crossRef_df_gbif_filtered2_deduplicated.drop(rows_to_drop)
dataset_df = dataset_df.drop(columns=['relation_type_id', 'source_id', 'subj_id', 'subj_work_type_id'])

# number of rows per dataset DOI in the citation table
crossRef_citation_counts = crossRef_df_gbif_filtered2_deduplicated['obj_id'].value_counts()

# need counts that include or exclude different relation_type_ids and subj_work_type_id

# attach the count to each dataset row via its DOI
dataset_df['crossRef_citation_count'] = dataset_df.obj_id.map(crossRef_citation_counts.to_dict())



In [None]:
# # pass publication DOIs to DOI.org to determine type of publication using checkDOIpubType function defined above
# pubTypeList = []
# pubDOI = []

# for count, doi in enumerate(crossRef_df_processed['subj_id']):
#     doiComponents = doi.split('/')[-2:]
#     doi = doiComponents[0] + "/" + doiComponents[1]
#     # print(type(doi))
#     pubType = checkDOIpubType(doi)
#     pubTypeList.append(pubType)
#     pubDOI.append(doi)
    
#     # add code to catch retries limit exceeded - might need to be in function itself?
#     # e.g. https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
    
#     time.sleep(0.5)
#     # if count % 200 == 0: # if count is a multiple of 200 wait for a bit
#     #         time.sleep(180)

# print('Done!')

# crossRef_df_processed['publicationType'] = pubTypeList
# crossRef_df_processed['publication_DOI'] = pubDOI


# # split publicationType column into publicationType1 and publicationSubType columns
# crossRef_df_processed[['publicationType1','publicationSubType']] = pd.DataFrame(crossRef_df_processed['publicationType'].tolist(), index=crossRef_df_processed.index)


# # determine publication type for records where DOI.org API call failed - ~10 mins
# newPubTypeList = []

# # get the unknown rows from scholex_df pubtype column and the pubDOI
# for pubType, pubDOI in zip(crossRef_df_processed['publicationType1'],crossRef_df_processed['publication_DOI']):
    
#     if pubType == 'unknown':

#         # need a catcher to make sure pubDOI is a single ID - in the cases where there was no DOI there is more than one ID recorded
#         if len(pubDOI) < 100: # if the length is less than 100 (arbitrarily) then it will be a single DOI
#             pass
#         else: # if there is no DOI skip this record
#             newPubTypeList.append(pubType) # leave it the same
#             #newPubType = pubType 
#             continue
        
#         # determine if crossref or datacite supplies the DOI
#         print('Pub DOI: ', pubDOI)
#         r = requests.get(('https://doi.org/doiRA/' + pubDOI), headers={"Accept": "application/json"})
#         DOIregistry = r.json()[0]['RA']
#         print(DOIregistry)

#         # query the crossref or datacite API
#         if DOIregistry == 'DataCite':
#             # ask the Datacite API what type of publication
#             r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

#         elif DOIregistry == 'Crossref':
#             r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
#             print(r.status_code)
#             newPubTypeList.append(r.json()['message']['type']) # could also add 'subtype':r.json()['message']['subtype']
#         else:
#             print('Unknown DOI registry')
        
#     else: 
#         newPubTypeList.append(pubType) # in the cases where the pubType is not unknown keep it the same

# crossRef_df_processed['newPubTypeList'] = newPubTypeList
# print("Done")

In [None]:
# Ask doi.org's doiRA endpoint which registration agency issued one example DOI.
pubDOI = '10.1007/s11368-018-1990-7'
ra_lookup_url = 'https://doi.org/doiRA/' + pubDOI
r = requests.get(ra_lookup_url, headers={"Accept": "application/json"})
ra_records = r.json()
# the agency name is read from the 'RA' key of the first returned element
DOIregistry = ra_records[0]['RA']
print(DOIregistry)

In [None]:
# When DataCite registered the DOI, ask the DataCite API for the publication type.

# newPubTypeList is otherwise only created inside the commented-out cell above,
# so create it here when it is missing; appending would raise NameError.
if 'newPubTypeList' not in globals():
    newPubTypeList = []

if DOIregistry == 'DataCite':
    # ask the Datacite API what type of publication
    r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
    print(r.status_code)
    newPubTypeList.append(r.json()['data']['attributes']['types']['citeproc']) # is citeproc the correct one to use?

In [None]:
# Look up the current pubDOI in the Crossref works API, print the HTTP status
# and show the 'message' part of the JSON reply.
works_url = 'https://api.crossref.org/works/' + pubDOI
r = requests.get(works_url, headers={"Accept": "application/json"})
print(r.status_code)
r.json()['message']

In [None]:

# Trial request against doi.org for a single publication, to see what metadata
# (e.g. title, authors) could be added to crossRef_df_gbif_filtered2_deduplicated.
doi_url = 'https://doi.org/10.5194/gmd-11-1377-2018'
print(doi_url)

# asks for a JSON representation; this request sometimes throws up unexpected errors
json_headers = {"Accept": "application/json"}
r = requests.get(doi_url, headers=json_headers)

# decoding can fail: the API sometimes returns info in a format that is not JSON
r.json()

In [None]:
# Collect title, authors and publisher for every citing publication by
# requesting JSON metadata from its DOI URL, then add them as new columns of
# crossRef_df_gbif_filtered2_deduplicated.
# Very slow if crossRef_df_gbif_filtered2_deduplicated is large - about 1 hour
# for everything from year 2000.
# TODO: switch this to the Crossref API; the doi.org route has proved unreliable here.
_INFO_NOT_GIVEN = "Info not given"


def _extract_pub_info(record):
    """Return (title, authors, publisher) taken from one decoded metadata record.

    A missing field is replaced by "Info not given". ``authors`` is a list of
    [given, family] pairs, or the placeholder when the author list is absent or
    any entry lacks one of the two name parts.
    """
    if not isinstance(record, dict):
        record = {}
    title = record.get('title', _INFO_NOT_GIVEN)
    publisher = record.get('publisher', _INFO_NOT_GIVEN)
    try:
        authors = [[person['given'], person['family']] for person in record['author']]
    except (KeyError, TypeError):
        authors = _INFO_NOT_GIVEN
    return title, authors, publisher


pub_info = []
# Each citing DOI is requested once even when it cites several datasets; the
# values are mapped back onto every matching row below.
for pubdoi in crossRef_df_gbif_filtered2_deduplicated['subj_id'].unique():
    try:
        # timeout so one stalled connection cannot hang the whole run
        r = requests.get(pubdoi, headers={"Accept": "application/json"}, timeout=30)
        record = r.json()  # decode once; raises a ValueError subclass when the body is not JSON
    except (requests.RequestException, ValueError) as err:
        # one bad DOI should not abort the run: report it and record placeholders
        print(f"No metadata retrieved for {pubdoi}: {err}")
        record = {}

    title, authors, publisher = _extract_pub_info(record)

    pub_info.append({
        'pub_doi': pubdoi,
        'pub_Title': title,
        'pub_authors': authors,
        'publisher': publisher,
        # add publication date to this - in order to check if this is before the dataset publication date - could be a way to filter out dodgy results
    })

    print(pubdoi)

# add new publication info columns to dataframe
pubInfo_df = pd.DataFrame(pub_info)

# loop through new columns to be added to df (the first column is the 'pub_doi' key itself)
for ii in pubInfo_df.columns[1:]:
    # create dictionary of doi, value pairs
    d = pubInfo_df.set_index('pub_doi')[ii].to_dict()

    # use the doi to map the dictionary to crossRef_df_gbif_filtered2_deduplicated
    crossRef_df_gbif_filtered2_deduplicated.loc[:,ii] = crossRef_df_gbif_filtered2_deduplicated.subj_id.map(d)

print("Done!")

In [None]:
# add a step to remove results not from an approved list of publishers/journals?

# how to make a list of approved publishers?

In [None]:
# Show the citation table for inspection.
crossRef_df_gbif_filtered2_deduplicated

### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# TAKES A LONG TIME - can skip this if you don't want this info
# Request an APA-formatted citation string for every citing publication through
# DOI content negotiation and store it in a new 'PubCitationStr' column.
citationStrList = [] # create an empty list in which to put the citation strings

# The citation style is a parameter of the Accept media type; sent as a separate
# "style" header it is not part of the content negotiation request.
citation_headers = {"Accept": "text/x-bibliography; style=apa"}

for pubDOI in crossRef_df_gbif_filtered2_deduplicated['subj_id']:

    try:
        # timeout so one stalled connection cannot hang the whole run
        r = requests.get(pubDOI, headers=citation_headers, timeout=30)
    except requests.RequestException as err:
        print(f"No citation retrieved for {pubDOI}: {err}")
        citation = "Info not given"
    else:
        # keep the body of an error response out of the citation column
        citation = r.text if r.ok else "Info not given"

    citationStrList.append(citation) # one entry per row, in row order

# add the citation string list to the citation dataframe
crossRef_df_gbif_filtered2_deduplicated['PubCitationStr'] = citationStrList

In [None]:
# Show the citation table again for inspection.
crossRef_df_gbif_filtered2_deduplicated

In [None]:
# add section comparing counts from Crossref, DataCite and Scholix
# first build a tool that takes one dataset and compares citation results from each source


In [None]:
# Write both result tables to CSV. Each file name carries the queried date
# range and today's date formatted as ddmmyyyy.

from datetime import date
today = date.today()
retrieved_on = today.strftime("%d%m%Y")

# per-citation table
crossRef_df_processed_filename = f"dataset_citation_publication_info_{start_date}_to_{end_date}_retrieved_{retrieved_on}.csv"
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index=False)
print(crossRef_df_processed_filename)

# per-dataset table
dataset_filename = f"dataset_citation_counts_{start_date}_to_{end_date}_retrieved_{retrieved_on}.csv"
dataset_df.to_csv(dataset_filename, index=False)
print(dataset_filename)

In [None]:
# Write both result tables to CSV when the run was driven by a list of DOIs
# rather than a date range. Each file name carries the list name and today's
# date formatted as ddmmyyyy.

from datetime import date
today = date.today()
doi_list_name = "marika_doi_list_scholix"
retrieved_on = today.strftime("%d%m%Y")

# per-citation table
crossRef_df_processed_filename = f"dataset_citation_publication_info_{doi_list_name}_retrieved_{retrieved_on}.csv"
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index=False)
print(crossRef_df_processed_filename)

# per-dataset table
dataset_filename = f"dataset_citation_counts_{doi_list_name}_retrieved_{retrieved_on}.csv"
dataset_df.to_csv(dataset_filename, index=False)
print(dataset_filename)