### NERC dataset citations
Code to collect NERC dataset citations from Scholix, CrossRef and DataCite APIs, process and merge the results.
Produces a csv and json with details of the citations for NERC published datasets.

In [1]:
import requests, time, json, re, datetime, os, sys
import numpy as np
import pandas as pd
from math import ceil
from datetime import date
from scholix_fun import getNERCDataDOIs, getScholixDatasetCitations, process_citation_results, getPublicationType, countScholixCitations, getCitationString
import crossRef_fun
from crossRef_fun import getDataCiteInfo, getCrossRefCitations, filterCrossRefResults, mergeDFs, getPublicationInfo
from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy
import exportCitationResultsToCsv
from Results import convertCSVtoJSON


sys.path.insert(0, '..')


## Scholix

### Get the dataset DOIs

In [21]:
# this takes approx ~20 mins
# will print output as it goes along to see progress - in JupyterLab right click on the output and select enable scrolling for outputs
dataCite_df = getNERCDataDOIs.getNERCDataDOIs()

Total records: 4065
Total pages: 163
Status:  200
Page:  1
Status:  200
Page:  2
Status:  200
Page:  3
Status:  200
Page:  4
Status:  200
Page:  5
Status:  200
Page:  6
Status:  200
Page:  7
Status:  200
Page:  8
Status:  200
Page:  9
Status:  200
Page:  10
Status:  200
Page:  11
Status:  200
Page:  12
Status:  200
Page:  13
Status:  200
Page:  14
Status:  200
Page:  15
Status:  200
Page:  16
Status:  200
Page:  17
Status:  200
Page:  18
Status:  200
Page:  19
Status:  200
Page:  20
Status:  200
Page:  21
Status:  200
Page:  22
Status:  200
Page:  23
Status:  200
Page:  24
Status:  200
Page:  25
Status:  200
Page:  26
Status:  200
Page:  27
Status:  200
Page:  28
Status:  200
Page:  29
Status:  200
Page:  30
Status:  200
Page:  31
Status:  200
Page:  32
Status:  200
Page:  33
Status:  200
Page:  34
Status:  200
Page:  35
Status:  200
Page:  36
Status:  200
Page:  37
Status:  200
Page:  38
Status:  200
Page:  39
Status:  200
Page:  40
Status:  200
Page:  41
Status:  200
Page:  42
Status

### Pass the dataset DOIs to the scholex API to get the citations and their respective DOIs

In [22]:
# this takes about 8 mins
# prints output as it goes along to see progress - in JupyterLab right click on the output and select enable scrolling for outputs
scholex_df = getScholixDatasetCitations.getScholixDatasetCitations(dataCite_df)

{'sourcePid': '10.5285/f6b9b2b3-1ad0-4ac1-a19b-bb340427fbf1'}
Status:  200
{'sourcePid': '10.5285/7f5b221e-d251-4bac-9252-16b4553dc345'}
Status:  200
{'sourcePid': '10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f'}
Status:  200
{'sourcePid': '10.5285/a29c5465-b138-234d-e053-6c86abc040b9'}
Status:  200
{'sourcePid': '10.5285/00d4b022-972e-7816-e063-6c86abc0048e'}
Status:  200
{'sourcePid': '10.5285/b2b5498c-8eef-44b0-9610-844784f9f16b'}
Status:  200
{'sourcePid': '10.5285/bd375134bd8c4990a1e9eb6d199cc723'}
Status:  200
{'sourcePid': '10.5285/66e39885a60e4b6386752b1a295f268a'}
Status:  200
{'sourcePid': '10.5285/edf66239c70c426e9e9f19da1ac8ba87'}
Status:  200
{'sourcePid': '10.5285/9a62bb8a-dd88-42d2-9af2-25ec338110e6'}
Status:  200
{'sourcePid': '10.5285/76ebdc0b-0eed-4f70-b89e-55e606bcd568'}
Status:  200
{'sourcePid': '10.5285/5dc179dc-f692-49ba-9326-a6893a503f6e'}
Status:  200
{'sourcePid': '10.5285/4fd3350b-f64c-4b7f-b129-4210f14a7e94'}
Status:  200
{'sourcePid': '10.5285/58080f33-884c-

#### Process the citation results

In [23]:
dataCite_df, scholex_df = process_citation_results.process_citation_results(dataCite_df, scholex_df)

In [24]:
# filter out gbif registrant code prefix 10.15468
# scholex_df = scholex_df[~scholex_df.pubID.str.contains("10.15468")]
scholex_df = scholex_df[~scholex_df['pubID'].apply(lambda x: str(x)).str.contains("10.15468")]


### Check the DOIs at DOI.org to determine the type of publication and to check there are no duplicates (by preprints etc) 

In [25]:
# very long 3+ hours
scholex_df = getPublicationType.getPublicationType(scholex_df)

Done!
Pub DOI:  10.1016/j.gr.2022.01.006
Crossref
200
Pub DOI:  10.1038/s43247-022-00574-8
Crossref
200
Pub DOI:  10.3390/rs14112675
Crossref
200
Pub DOI:  10.1080/21664250.2022.2117585
Crossref
200
Pub DOI:  10.1080/17538947.2022.2108923
Crossref
200
Pub DOI:  10.1175/jpo-d-22-0031.1
Crossref
200
Pub DOI:  10.1007/s00300-022-03033-4
Crossref
200
Pub DOI:  10.3389/fmicb.2023.1101902
Crossref
200
Pub DOI:  10.1080/00222933.2022.2074906
Crossref
200
Pub DOI:  10.1038/s41561-022-01003-3
Crossref
200
Pub DOI:  10.1134/s0016852122010058
Crossref
200
Pub DOI:  10.5194/essd-13-4967-2021
Crossref
200
Pub DOI:  10.21203/rs.3.rs-1722308/v1
Crossref
200
Pub DOI:  10.1038/s41598-022-15301-8
Crossref
200
Pub DOI:  10.3390/jmse9121460
Crossref
200
Pub DOI:  10.1080/00288306.2022.2121288
Crossref
200
Pub DOI:  10.3390/rs14122913
Crossref
200
Pub DOI:  10.1016/j.pocean.2022.102849
Crossref
200
Pub DOI:  10.3390/jmse10060793
Crossref
200
Pub DOI:  10.21203/rs.3.rs-2025730/v1
Crossref
200
Pub DOI:  10.3

### Output spreadsheet

In [26]:
today = date.today()

scholex_filename = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/" + 'scholix_citation_publication_info_' + (today.strftime("%d%m%Y")) + '.csv'
scholex_df.to_csv(scholex_filename, index = False)

## CrossRef

In [2]:
# Code for getting results based on a date range

email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "1990-01-01"
today = date.today()
end_date = today.strftime("%Y-%m-%d")
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/"
results_folder_path_name = results_folder_path + "NERC_EDS_events_from_" + start_date + "_up_to_" + end_date

getCrossRefCitations.getCrossRefCitations_byDates(email, prefix, start_date, end_date, results_folder_path)

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-07-24
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/event_data_10.5285_1990-01-01_2023-07-24.json
45958 events found
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=2162cf3a-79be-4bd3-aaa7-1bdb682872b5&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-07-24
Event Data query started...
API query complete  200
output file written to C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/NERC_EDS_events_from_1990-01-01_up_to_2023-07-24/page0000.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=ec36cf84-461c-40bb-8342-ad101145241a&rows=1000&obj-id.prefix=10.5285&from-occurred-date=1990-01-01&until-occurred-date=2023-07-24
Event Dat

In [3]:
# filter results 
crossRef_df_gbif_filtered2_deduplicated = filterCrossRefResults.filterCrossRefResults(results_folder_path_name)

In [4]:
# Pass citation info to datacite API to collect relevant info on the datasets, data centres etc
(errors, dataCite_df) = getDataCiteInfo.getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

https://doi.org/10.5285/6c6c9203-7333-4d96-88ab-78925e7a4e73 https://doi.org/10.1007/s11368-018-1990-7
API response:  200
https://doi.org/10.5285/4c7fdfa6-f176-4c58-acee-683d5e9d2ed5 https://doi.org/10.5194/gmd-11-1377-2018
API response:  200
https://doi.org/10.5285/18BE23F8-D252-482D-8AF9-5D6A2D40990C https://doi.org/10.1007/s00704-017-2246-y
API response:  200
https://doi.org/10.5285/e1d33b37-f1d4-4234-a0d5-8bf4e657f653 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/4c9613ce-de52-41b1-9fde-7c41f9199686 https://doi.org/10.1007/s00484-018-1509-3
API response:  200
https://doi.org/10.5285/58a8802721c94c66ae45c3baa4d814d0 https://doi.org/10.1007/s00704-018-2476-7
API response:  200
https://doi.org/10.5285/475520d5-bad9-4d84-e053-6c86abc0b01b https://doi.org/10.1007/s10533-017-0350-9
API response:  200
https://doi.org/10.5285/bad1514f-119e-44a4-8e1e-442735bb9797 https://doi.org/10.1007/s11269-018-1914-8
API response:  200
https://doi.org/10.5285/33604

In [5]:
(dataset_df, crossRef_df_gbif_filtered2_deduplicated) = mergeDFs.mergeDFs(dataCite_df,crossRef_df_gbif_filtered2_deduplicated)

In [6]:
crossRef_df_gbif_filtered2_deduplicated = getPublicationInfo.getPublicationInfo(crossRef_df_gbif_filtered2_deduplicated)

https://doi.org/10.1007/s11368-018-1990-7
https://doi.org/10.5194/gmd-11-1377-2018
https://doi.org/10.5194/gmd-11-1343-2018
https://doi.org/10.1007/s00704-017-2246-y
https://doi.org/10.1002/2017WR021682
https://doi.org/10.1007/s00484-018-1509-3
https://doi.org/10.1007/s00484-018-1509-3
https://doi.org/10.1007/s00704-018-2476-7
https://doi.org/10.1007/s00704-018-2392-x
https://doi.org/10.1007/s00704-018-2532-3
https://doi.org/10.1002/joc.5221
https://doi.org/10.1007/s10533-017-0350-9
https://doi.org/10.1007/s11269-018-1914-8
https://doi.org/10.1007/s10584-018-2145-y
https://doi.org/10.1007/s10584-018-2158-6
https://doi.org/10.1007/s00382-018-4183-6
https://doi.org/10.1007/s00382-018-4234-z
https://doi.org/10.1007/s11600-018-0165-7
https://doi.org/10.5194/bg-14-799-2017
https://doi.org/10.5194/bg-14-1181-2017
https://doi.org/10.5194/bg-14-2069-2017
https://doi.org/10.5194/hess-21-4785-2017
https://doi.org/10.5194/hess-21-1189-2017
https://doi.org/10.5194/hess-22-611-2018
https://doi.org/

### Output spreadsheet

In [7]:
crossRef_df_processed_filename = results_folder_path + 'dataset_citation_publication_info_' + start_date + "_to_" + end_date + "_retrieved_" + (today.strftime("%d%m%Y")) + '.csv'
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index = False)

## DataCite

In [8]:
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to']
dataCite_df_relationTypes = getDataCiteCitations_relationTypes.getDataCiteCitations_relationTypes(relation_type_id_list)

is-cited-by
Total records: 19
Total pages: 1
Status:  200
Page:  1
Final page
is-referenced-by
Total records: 1412
Total pages: 2
Status:  200
Page:  1
https://api.datacite.org/events?page%5Bcursor%5D=MTY3NTk0MDk2OTEyOCw1NGQ3ZDZhZi04OTdjLTQzNzMtOGJmMi1iMzczY2VjNjljNzc&page%5Bsize%5D=1000&prefix=10.5285&relation-type-id=is-referenced-by
Status:  200
Page:  2
Final page
is-supplement-to
Total records: 8
Total pages: 1
Status:  200
Page:  1
Final page
Done!


In [9]:
# remove http from DOI url
doi_list = []
for url in dataCite_df_relationTypes['subj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['data_doi'] = doi_list
dataCite_df_relationTypes = dataCite_df_relationTypes.drop(['subj-id'], axis=1)

# doi_list = []
# for url in dataCite_df_relationTypes['obj-id']:
#     doi = url.replace('https://doi.org/','')
#     doi_list.append(doi)
# dataCite_df_relationTypes['pub_doi'] = doi_list # rename to work in getPublicationInfo function 
    
dataCite_df_relationTypes = dataCite_df_relationTypes.rename(columns={"obj-id": "pub_doi_url"})

In [10]:
# drop the rows where the data_doi column value does not start with "10.5285"
dataCite_df_relationTypes = dataCite_df_relationTypes[dataCite_df_relationTypes['data_doi'].str.startswith('10.5285')]

In [11]:
# get dataset metadata 
info_list = []
headers = {'client-id': 'bl.nerc'}
api_url = 'https://api.datacite.org/dois/' 
# for doi in dataCite_df_relationTypes['data_doi']:
for (source_id, relation_type_id, occurred_at, Page_endpoint, data_doi, pub_doi_url) in zip(dataCite_df_relationTypes['source-id'],dataCite_df_relationTypes['relation-type-id'],dataCite_df_relationTypes['occurred-at'],dataCite_df_relationTypes['Page endpoint'],dataCite_df_relationTypes['data_doi'], dataCite_df_relationTypes['pub_doi_url']):
    r = requests.get((api_url + data_doi), headers)
    print(r.status_code, data_doi)
    
    try:
        # process author info
        author_list = []
        for item in r.json()['data']['attributes']['creators']:
            author_list.append(item['name'])

        info_list.append([
            r.json()['data']['attributes']['publisher'],
            data_doi,
            r.json()['data']['attributes']['titles'][0]['title'],
            author_list,
            r.json()['data']['attributes']['publicationYear'],
            r.json()['data']['attributes']['dates'],
            r.json()['data']['attributes']['registered'],
            source_id, relation_type_id, pub_doi_url, occurred_at, Page_endpoint
        ])
    except Exception as e:
        info_list.append(["error",data_doi,"error","error","error","error","error","error","error",pub_doi_url,"error","error"])
        
columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered', 
           'source-id', 'relation-type-id', 'pub_doi_url', 'occurred-at', 'Page endpoint']
dataCite_df = pd.DataFrame(info_list, columns = columns)    
print("Done!")
    

200 10.5285/2641515f-5b76-445c-a936-1da51bf365ad
200 10.5285/6feac38a-5847-46f9-84e4-e7e9d291f935
200 10.5285/db55406b-c9a1-4a9e-88c2-2abbcb4bcad3
200 10.5285/5321bc6e-be35-4ed3-9b56-25598d61ac8f
200 10.5285/c4ecfe25-12f2-453b-ad19-49a19e90ee32
200 10.5285/1f44795b-e596-433c-b69f-caf674880daa
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/bc114581-413a-4c62-adb6-556155901f68
200 10.5285/9ab1541b-e71a-4d75-9907-968d47755e99
200 10.5285/21a8fca5-8eae-48e4-93c0-bc6b4433e34c
200 10.5285/d77dd930-654b-4d09-99bd-8df3b00025a8
200 10.5285/d77dd930-654b-4d09-99bd-8df3b00025a8
200 10.5285/0f074839-1630-4ccd-aa63-84d0da16b28a
200 10.5285/cc1d42de-dfe6-40aa-a1a6-d45cb2fc8293
200 10.5285/dada63fb-c40a-4b13-97ba-c53860881d79
200 10.5285/3952a4fe-683a-42e7-a074-bdec41c8ab16
200 10.5285/3578bae2-4b88-4b2e-93e1-6965dfe1348c
200 10.5285/a1ab8c79-3426-43a4-ab42-6d1b218d1cc6
200 10.5285/0cf552a6-cd62-4da0-8289-8e4bab0a35a8
200 10.5285/65abc40d

In [17]:
# get publication info
dataCite_df_temp = dataCite_df.rename(columns={"pub_doi_url": "subj_id"})
dataCite_df_publication_meta = getPublicationInfo_timeCopy.getPublicationInfo(dataCite_df_temp)

https://doi.org/10.1016/j.scitotenv.2012.05.023 200
https://doi.org/10.1002/2015gl065750 200
https://doi.org/10.17863/cam.20713 200
https://doi.org/10.1111/1365-2656.12728 200
https://doi.org/10.5194/cp-2017-18 200
https://doi.org/10.1002/2016gl068130 200
https://doi.org/10.1029/2007gl032529 200
https://doi.org/10.1029/2009gl040104 200
https://doi.org/10.1029/2009jd012263 200
https://doi.org/10.1002/2015gl065750 200
https://doi.org/10.3189/172756494794587438 200
https://doi.org/10.1029/2007gl032529 200
https://doi.org/10.1029/2009gl040104 200
https://doi.org/10.1111/1365-2656.12798 200
https://doi.org/10.5194/cp-2017-18 200
https://doi.org/10.1029/2018jc013982 200
https://doi.org/10.1175/jcli-d-17-0320.1 200
https://doi.org/10.1111/ele.13129 200
https://doi.org/10.1016/j.jenvrad.2017.06.024 200
https://doi.org/10.1029/2018jc014464 200
https://doi.org/10.5285/4859dc19-e8e9-4148-8c50-cb2ab16dc696 200
https://doi.org/10.5285/65abc40d-e256-414b-8b50-a5569556d1be 200
https://doi.org/10.5285

In [18]:
doi_list = []
for url in dataCite_df_publication_meta['subj_id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_publication_meta['pub_doi'] = doi_list # rename to work in getPublicationInfo function 

In [19]:
#process dataset publisher names
newPublisherLst = []
for dataCentreName in dataCite_df_publication_meta['data_publisher']:
    if type(dataCentreName) == float or dataCentreName is None:
        newPublisherLst.append(dataCentreName)
        continue
    else:
        pass

    dataCentreName_lower = dataCentreName.lower() # make it all lowercase as 'in' operator used below is case sensitive
    if 'polar' in dataCentreName_lower:
        newPublisherLst.append('Polar Data Centre (PDC)')
    elif 'atmospheric' in dataCentreName_lower or 'badc' in dataCentreName_lower or 'earth' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    elif 'oceanographic' in dataCentreName_lower:
        newPublisherLst.append('British Oceanographic Data Centre (BODC)')
    elif 'geological' in dataCentreName_lower or 'geoscience' in dataCentreName_lower:
        newPublisherLst.append('National Geoscience Data Centre (NGDC)')
    elif 'environmental information' in dataCentreName_lower:
        newPublisherLst.append('Environmental Information Data Centre (EIDC)')
    elif 'environmental data' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    else:
        newPublisherLst.append(dataCentreName)
dataCite_df_publication_meta['publisher_processed'] = newPublisherLst

dataCite_df_publication_meta = dataCite_df_publication_meta.drop(['data_publisher'], axis=1)
dataCite_df_publication_meta = dataCite_df_publication_meta.rename(columns={'publisher_processed':'data_publisher'})

### Output spreadsheet

In [20]:
dataCite_filename = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/" + 'dataCite_df_events_publication_meta_retrieved_' + (today.strftime("%d%m%Y")) + '.csv'
dataCite_df_publication_meta.to_csv(dataCite_filename, index = False)

# Merge results

In [27]:
scholex_df = pd.read_csv(scholex_filename)
crossref_df = pd.read_csv(crossRef_df_processed_filename)
datacite_df = pd.read_csv(dataCite_filename)


In [29]:
## Remove https bits from crossref df DOIs
# remove url bit from subj_id
crossref_doi_list = []
for url in crossref_df['subj_id']:
    doi = url.replace('https://doi.org/','')
    crossref_doi_list.append(doi)
crossref_df['subj_doi'] = crossref_doi_list

# remove url bit from 'obj_id'
crossref_doi_list = []
for url in crossref_df['obj_id']:
    temp = url.split('/')
    crossref_doi_list.append(temp[3] + "/" + temp[4])
crossref_df['obj_doi'] = crossref_doi_list

In [30]:
# process the three dataframes make columns match
# columns should be:
# can add event_source, dates, publication_publisher columns later
newColumns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors']

crossref_column_list = [crossref_df['dataset_publisher_processed'],crossref_df['obj_doi'],crossref_df['dataset_Title'],crossref_df['dataset_authors'],crossref_df['relation_type_id'],crossref_df['subj_doi'],crossref_df['subj_work_type_id'],crossref_df['pub_Title'],crossref_df['pub_authors']]
crossref_df_newColumns = pd.concat(crossref_column_list, axis = 1)
crossref_df_newColumns.columns = newColumns

# scholex_column_list = [scholex_df[['datasetPublisher']],scholex_df[['datasetDOI']],scholex_df[['datasetTitle']],scholex_df[['datasetAuthors_processed']],scholex_df[['relationshipType']],scholex_df[['pubID']],scholex_df[['PubType']],scholex_df[['pubTitle']],scholex_df[['pubAuthors_processed']]]
scholex_column_list = [scholex_df['datasetPublisher'],scholex_df['datasetDOI'],scholex_df['datasetTitle'],scholex_df['datasetAuthors'],scholex_df['relationshipType'],scholex_df['pubID'],scholex_df['PubType'],scholex_df['pubTitle'],scholex_df['pubAuthors_processed']]
scholex_df_newColumns = pd.concat(scholex_column_list, axis = 1)
scholex_df_newColumns.columns = newColumns

datacite_column_list = [datacite_df['data_publisher'], datacite_df['data_doi'], datacite_df['data_title'], datacite_df['data_authors'], datacite_df['relation-type-id'], datacite_df['pub_doi'], datacite_df['publisher'], datacite_df['pub_Title'], datacite_df['pub_authors']] 
datacite_df_newColumns = pd.concat(datacite_column_list, axis = 1)
datacite_df_newColumns.columns = newColumns


In [31]:
# create single list of data dois 
scholix_doi_list = list(scholex_df_newColumns['data_doi'])
crossref_doi_list = list(crossref_df_newColumns['data_doi'])
datacite_doi_list = list(datacite_df_newColumns['data_doi'])

data_doi_list = scholix_doi_list + crossref_doi_list + datacite_doi_list

# remove duplicates = convert to dict and back to list again auto removes dups
data_doi_list_unique = list( dict.fromkeys(data_doi_list))

In [32]:
# loop through list of data dois, check pub DOI in each of crossref scholex and datacite dfs - compare result
comparison_dicts = []
data_doi_df = pd.DataFrame(data_doi_list_unique)
for doi in data_doi_df[0]:
    doi = doi.replace(")","") # remove rogue brackets
    scholex_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(doi)].index
    scholex_matches = scholex_df_newColumns['publication_doi'].iloc[scholex_indices].tolist()
    
    crossref_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(doi)].index
    crossref_matches = crossref_df_newColumns['publication_doi'].iloc[crossref_indices].tolist()
    
    datacite_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(doi)].index
    datacite_matches = datacite_df_newColumns['publication_doi'].iloc[datacite_indices].tolist()
    
    combined = scholex_matches + crossref_matches + datacite_matches
    combined_unique = list(dict.fromkeys(combined))
    
    inScholix_notIn_crossRef = list(set(scholex_matches) - set(crossref_matches))
    inCrossRef_notIn_scholix = list(set(crossref_matches) - set(scholex_matches))
    inDatacite_notIn_scholix_or_crossRef = list(set(datacite_matches) - set(scholex_matches) - set(crossref_matches))
    
    comparison_dicts.append({
        'data_doi': doi,
        'combined_unique_dois': combined_unique,
        'scholex_pub_dois': scholex_matches,
        'crossref_pub_dois': crossref_matches,
        'datacite_pub_dois': datacite_matches,
        'inScholix_notIn_crossRef':inScholix_notIn_crossRef,
        'inCrossRef_notIn_scholix':inCrossRef_notIn_scholix,
        'inDatacite_notIn_scholix_or_crossRef':inDatacite_notIn_scholix_or_crossRef
    })


In [33]:
# for combined_unique_dois create a final dataframe getting metadata  from dfs
results = []
for dataset in comparison_dicts:
    for pubdoi in dataset['scholex_pub_dois']:
        
        # find index of this pubdoi datadoi pair in scholex_df
        pub_indices = scholex_df_newColumns[scholex_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("scholex_pub_dois")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': scholex_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': scholex_df_newColumns.iloc[index]['data_title'],
            'data_Authors': scholex_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': scholex_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': scholex_df_newColumns.iloc[index]['publication_type'],
            'publication_title': scholex_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': scholex_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'Scholix'
            })
  
    
    for pubdoi in dataset['inCrossRef_notIn_scholix']:
        # find index of this pubdoi datadoi pair in crossref_df
        pub_indices = crossref_df_newColumns[crossref_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inCrossRef_notIn_scholix")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': crossref_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': crossref_df_newColumns.iloc[index]['data_title'],
            'data_Authors': crossref_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': crossref_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': crossref_df_newColumns.iloc[index]['publication_type'],
            'publication_title': crossref_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': crossref_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'CrossRef'
            })
        
        
    for pubdoi in dataset['inDatacite_notIn_scholix_or_crossRef']:
        # find index of this pubdoi datadoi pair in datacite_df_newColumns
        pub_indices = datacite_df_newColumns[datacite_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inDatacite_notIn_scholix_or_crossRef")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': datacite_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': datacite_df_newColumns.iloc[index]['data_title'],
            'data_Authors': datacite_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': datacite_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': datacite_df_newColumns.iloc[index]['publication_type'],
            'publication_title': datacite_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': datacite_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'DataCite'
            })

scholex_pub_dois
pub_indices:  Int64Index([], dtype='int64') data_indices:  Int64Index([ 43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
             56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
             69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
             82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
             95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
            108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
            121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131],
           dtype='int64')
{
  "data_doi": "10.5285/a29c5465-b138-234d-e053-6c86abc040b9",
  "combined_unique_dois": [
    "10.2478/prolas-2022-0039",
    "10.5281/zenodo.4783659",
    "10.2478/jengeo-2020-0008",
    "10.1016/j.quascirev.2022.107377",
    "10.1016/j.jasrep.2020.102658",
    "10.2478/ouacsce-2020-0002",
    "10.1186/s40645-022-00473-8",
    "10.3929/ethz-b-000488372",
    "10.

In [34]:
data_citations = pd.DataFrame.from_dict(results)

### Get the citation string (APA format) of the publication that has cited the dataset

In [35]:
# TAKES A LONG TIME - hours
citationStrList = [] # create an empty list in which to put the citation strings

for pubDOI in data_citations['publication_doi']:
    if pubDOI.startswith('10.'):
        r = requests.get(('https://doi.org/' + pubDOI), headers={"Accept": "text/x-bibliography", "style": "apa"})
        #print(r.status_code)
        citationStrList.append(r.text) # add the citation strings to the list
    else:
        citationStrList.append('not a doi')
    
data_citations['PubCitationStr'] = citationStrList # add the citation string list to the Scholex df

In [36]:
data_citations

Unnamed: 0,data_Publisher,data_doi,data_Title,data_Authors,relation_type_id,publication_doi,publication_type,publication_title,publication_authors,citation_event_source,PubCitationStr
0,British Oceanographic Data Centre (BODC),10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f,The GEBCO_2021 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2021'],IsRelatedTo,10.1016/j.gr.2022.01.006,journal-article,Crustal structure and tectonic evolution of Gr...,"['Priyesh Kunnummal', 'S.P. Anand']",Scholix,"Kunnummal, P., & Anand, S. P. (2022). Crustal ..."
1,British Oceanographic Data Centre (BODC),10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f,The GEBCO_2021 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2021'],IsRelatedTo,10.1038/s43247-022-00574-8,journal-article,Projected climate variability of internal wave...,"['Prof. A D Rao', 'Badarvada Yadidya']",Scholix,"Yadidya, B., & Rao, A. D. (2022). Projected cl..."
2,British Oceanographic Data Centre (BODC),10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f,The GEBCO_2021 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2021'],IsRelatedTo,10.3390/rs14112675,journal-article,Deep-Sea Seabed Sediment Classification Using ...,"['Qiuhua Tang', 'Jie Li', 'Deqiu Ding', 'Xue J...",Scholix,"Tang, Q., Li, J., Ding, D., Ji, X., Li, N., Ya..."
3,British Oceanographic Data Centre (BODC),10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f,The GEBCO_2021 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2021'],IsRelatedTo,10.1080/21664250.2022.2117585,journal-article,A new tsunami hazard assessment for eastern Ma...,"['Payam Momeni', 'Katsuichiro Goda', 'Mohammad...",Scholix,"Momeni, P., Goda, K., Mokhtari, M., & Heidarza..."
4,British Oceanographic Data Centre (BODC),10.5285/c6612cbe-50b3-0cff-e053-6c86abc09f8f,The GEBCO_2021 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2021'],IsRelatedTo,10.1080/17538947.2022.2108923,journal-article,Mapping China’s offshore mariculture based on ...,"['Xiaoliang Liu', 'Zhihua Wang', 'Xiaomei Yang...",Scholix,"Liu, X., Wang, Z., Yang, X., Liu, Y., Liu, B.,..."
...,...,...,...,...,...,...,...,...,...,...,...
3678,Environmental Information Data Centre (EIDC),10.5285/aaada12b-0af0-44ba-8ffc-5e07f410f435,Topsoil physico-chemical properties from the U...,"['Robinson, D.A.', 'Alison, J.', 'Andrews, C.'...",is-referenced-by,10.1111/1365-2745.14039,Wiley,Fifty years of reduction in sulphur deposition...,"[['Fiona M.', 'Seaton'], ['David A.', 'Robinso...",DataCite,"Seaton, F. M., Robinson, D. A., Monteith, D., ..."
3679,Environmental Information Data Centre (EIDC),10.5285/65b23264-2991-4ce2-83e7-7729a3947d75,Biodiversity and environmental conditions alon...,"['Norton, B.A.', 'Mears, M.', 'Warren, P.H.', ...",is-referenced-by,10.1016/j.ufug.2023.127951,Elsevier BV,Biodiversity and environmental stressors along...,"[['Briony A.', 'Norton'], ['Meghann', 'Mears']...",DataCite,"Norton, B. A., Mears, M., Warren, P. H., Siriw..."
3680,Environmental Information Data Centre (EIDC),10.5285/86d07d98-2956-4395-8b02-29dd5d98e6be,"Predicted soil erosion rates, nutrient fluxes ...","['Feeney, C.J.', 'Robinson, D.A.', 'Thomas, A....",is-referenced-by,10.1016/j.scitotenv.2023.161925,Elsevier BV,Agricultural practices drive elevated rates of...,"[['Christopher J.', 'Feeney'], ['David A.', 'R...",DataCite,"Feeney, C. J., Robinson, D. A., Thomas, A. R. ..."
3681,Environmental Information Data Centre (EIDC),10.5285/2dcc08e9-d8e6-4675-b78d-a318efc799d8,Temperature and humidity data supporting use o...,"['Werkmeister, G.A.', 'Galbraith, D.', 'Docher...",is-referenced-by,10.1186/s13007-022-00904-z,Springer Science and Business Media LLC,A novel in situ passive heating method for eva...,"[['Georgina A.', 'Werkmeister'], ['David', 'Ga...",DataCite,"Werkmeister, G. A., Galbraith, D., Docherty, E..."


## Output json and csv file

In [37]:
# Output csv file
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/v1"
file_name = 'dataCitations_allSourcesMerged_retrieved_' + (today.strftime("%d%m%Y"))

data_citations_csvfilename = results_folder_path + file_name + '.csv'
data_citations.to_csv(data_citations_csvfilename, index = False)
print(data_citations_csvfilename)

# write data to 'latest_results' csv file
latest_file_name = results_folder_path + 'latest_results' + '.csv'
data_citations.to_csv(latest_file_name, index = False)


# write data to 'latest_results' json file with data publisher as top level key
latest_file_name_json = results_folder_path + 'latest_results' + '.json'

# Group by 'data_Publisher' and convert the DataFrame to a nested dictionary
nested_dict = data_citations.groupby('data_Publisher').apply(
    lambda x: x.drop('data_Publisher', axis=1).to_dict(orient='records')
).to_dict()

# Convert the nested dictionary to a JSON object
import json
json_object = json.dumps(nested_dict)

# Save the JSON object to a file
with open(latest_file_name_json, 'w') as f:
    f.write(json_object)
    
data_citations_jsonfilename = results_folder_path + file_name + '.json'

with open(data_citations_jsonfilename, 'w') as f:
    f.write(json_object)


C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/dataCitations_allSourcesMerged_retrieved_24072023.csv


Extra cells useful for development, not for use in collecting the citation info:

In [None]:
# Read the JSON file into a pandas DataFrame
with open(latest_file_name_json, 'r') as f:
    json_data = json.load(f)

# Convert the nested dictionary back into a DataFrame
records = []
for publisher, data_list in json_data.items():
    for data_dict in data_list:
        data_dict['data_Publisher'] = publisher
        records.append(data_dict)

df = pd.DataFrame.from_records(records)

# Display the DataFrame
df


In [None]:
# check encoding of characters
result = data_citations[data_citations['publication_doi'] == '10.17976/jpps/2021.01.11']
 
print(result)

In [15]:
from importlib import reload  # Python 3.4+
reload(getPublicationInfo_timeCopy)

<module 'dataCite_fun.getPublicationInfo_timeCopy' from 'C:\\Users\\matnic\\OneDrive\\OneDrive - UKCEH\\Projects\\DataCentre Citations\\dataCite_fun\\getPublicationInfo_timeCopy.py'>

In [15]:
import pickle
# Saving the objects:
with open('objs.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([dataCite_df, scholex_df], f)

In [3]:
import pickle
# Getting back the objects:
with open('objs.pkl') as f:  # Python 3: open(..., 'rb')
    dataCite_df, scholex_df = pickle.load(f)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 38: character maps to <undefined>