## Workflow for counting citations of datasets held by NERC data centres


In [58]:
import requests
import numpy as np
import pandas as pd
import time

In [18]:
# Optional shortcut for anyone revisiting this notebook: reload dataframes saved by
# a previous run, so that a single part of the code can be run without first
# re-collecting the dataframe it needs.
# To load different files, fill in the filepaths (including the file extension) and
# use the commented-out lines instead:
# dataCite_df_fp = ''
# scholex_df_fp = ''
# dataCite_df = pd.read_csv(dataCite_df_fp)
# scholex_df = pd.read_csv(scholex_df_fp)
dataCite_df, scholex_df = (
    pd.read_csv(savedCsv)
    for savedCsv in ('dataset_citation_counts_22112022.csv',
                     'dataset_citation_publication_info_22112022.csv')
)

### Define a function to collect the dataset DOIs from the DataCite API

In [59]:
def getNERCDataDOIs():
    """Collect metadata for every DOI record DataCite returns for the `bl.nerc` client.

    The DataCite ``/dois`` endpoint is paged: the first page is requested directly
    and every following page is reached through the ``links.next`` URL of the
    previous response. A 503 response is retried once after a two minute wait; any
    other non-200 response stops the collection. Whatever was collected up to that
    point is still returned.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``publisher``, ``datasetDOI_attribute``,
        ``dates``, ``creators``, ``page_number``, ``Page endpoint`` and ``title``
        (the first title of the record, or 'No title given' when there is none).
    """
    dataCiteInfo = []  # one list per dataset record, in the column order used below

    # NOTE: the second positional argument of requests.get() is `params`, so these
    # values are sent as URL query parameters (they are not HTTP headers).
    url = 'https://api.datacite.org/dois'
    queryParams = {'client-id': 'bl.nerc', 'page': '1'}

    totalPages = 1  # replaced by the real value once the first page has been read
    pagesRead = 0

    while url is not None and pagesRead < totalPages:
        # make the API request and print the status code in case of an error
        r = requests.get(url, queryParams)
        print('Status: ', r.status_code)

        if r.status_code == 503: # server overloaded: wait a bit, then try once more
            print('Waiting 2 mins for server to recover...?')
            time.sleep(120)
            r = requests.get(url, queryParams)
            print('Second attempt: ', r.status_code)
            if r.status_code == 503:
                print("Server not recovered, try again later")
                break

        # anything other than 200 (including a failed second attempt) has no usable body
        if r.status_code != 200:
            print("Something else has gone wrong!")
            break

        payload = r.json()  # parse the response body once per page
        pagesRead += 1

        if pagesRead == 1:
            # the totals are taken from the first page instead of a separate request
            totalPages = payload['meta']['totalPages']
            print("Total records:", payload['meta']['total'])
            print("Total pages:", totalPages)

        pageNumber = payload['meta']['page']
        print('Page: ', pageNumber)

        # collect the info of every dataset record on this page
        for record in payload['data']:
            attributes = record['attributes']
            dataCiteInfo.append([attributes['publisher'],
                                 attributes['doi'],
                                 attributes['titles'],
                                 attributes['dates'], # remove this? Or change to just one date?
                                 attributes['creators'],
                                 pageNumber,
                                 payload['links']['self']])

        # the last page has no `next` link, which ends the collection
        url = payload['links'].get('next')
        queryParams = {'client-id': 'bl.nerc'}
        if url is None:
            print("Final page")
            break

        time.sleep(1) # wait for a bit, doing it too quickly may be overloading the server? often gives a 503 status error
        if pagesRead % 10 == 0: # after every 10th page wait for a bit longer
            time.sleep(30)

    # put the collected information into a pandas dataframe
    column_names = ["publisher", "datasetDOI_attribute", "title_unprocessed", "dates", "creators", "page_number", "Page endpoint"]
    dataCite_df = pd.DataFrame(dataCiteInfo, columns = column_names)

    # reduce the list of title entries to the first title (or a placeholder if there is none)
    dataCite_df['title'] = [titles[0]['title'] if titles else 'No title given'
                            for titles in dataCite_df['title_unprocessed']]
    dataCite_df = dataCite_df.drop(['title_unprocessed'], axis = 1)
    print('Done!')

    return dataCite_df

### Define a function to pass dataset DOIs to the Scholexplorer API and count the citations in journal articles etc

In [60]:
def getDatasetCitations(dataCite_df):
    """Look up each dataset DOI in Scholexplorer and list the publications citing it.

    Only links whose relationship type is "IsReferencedBy" are kept. Datasets for
    which Scholexplorer reports no links at all contribute no rows.

    NOTE(review): only the links contained in the single response per DOI are
    examined - confirm whether Scholexplorer paginates the results of heavily
    cited datasets.

    Parameters
    ----------
    dataCite_df : pandas.DataFrame
        Must contain the columns ``datasetDOI_attribute``, ``publisher``, ``title``
        and ``creators`` (as produced by getNERCDataDOIs).

    Returns
    -------
    pandas.DataFrame
        One row per "IsReferencedBy" link with the columns ``citations`` (number of
        such links found for the dataset), ``relationshipType``, ``pubTitle``,
        ``pubDate``, ``pubAuthors``, ``pubID``, ``datasetDOI``, ``datasetPublisher``,
        ``datasetTitle`` and ``datasetAuthors``.
    """
    scholexInfo = [] # one list per citation link, in the column order used below

    datasetColumns = ['datasetDOI_attribute', 'publisher', 'title', 'creators']

    # loop through info from the DataCite dataframe
    for doi, publisher, title, authors in dataCite_df[datasetColumns].itertuples(index=False, name=None):
        # NOTE: the second positional argument of requests.get() is `params`, so the
        # DOI is sent as a URL query parameter (it is not an HTTP header).
        queryParams = {'sourcePid': doi}
        r = requests.get('http://api.scholexplorer.openaire.eu/v2/Links', queryParams)
        print(queryParams)
        print('Status: ',  r.status_code)

        payload = r.json()  # parse the response body once per DOI

        # scholex API holds no further info if no citations, so skip to the next record
        if payload['totalLinks'] == 0:
            continue

        # we only want to use the "IsReferencedBy" type of relationship
        citingLinks = [link for link in payload['result']
                       if link['RelationshipType']['Name'] == "IsReferencedBy"]
        count = len(citingLinks)  # the completed count is stored on every row of this dataset

        for link in citingLinks:
            target = link['target']
            scholexInfo.append([count,
                                link['RelationshipType']['Name'],
                                target['Title'],
                                target['PublicationDate'],
                                target['Creator'],
                                _pickPublicationID(target['Identifier']),
                                doi, publisher, title, authors]) # info from dataCite_df

    # put the collected info into a dataframe
    column_names = ["citations", "relationshipType", "pubTitle", "pubDate", "pubAuthors", "pubID", "datasetDOI", "datasetPublisher", "datasetTitle", "datasetAuthors"]
    scholex_df = pd.DataFrame(scholexInfo, columns = column_names)
    print('Done!')
    return scholex_df


def _pickPublicationID(identifiers):
    """Return the first DOI among a publication's identifiers, else the whole identifier list."""
    # there can be multiple ID schemes (e.g. DOI, pubmed); we only want the DOI
    for idInfo in identifiers:
        if idInfo['IDScheme'] == 'doi':
            return idInfo['ID']
    # no DOI (or no identifiers at all): keep everything, to be looked at manually later
    return identifiers
    

### Get the dataset DOIs

In [61]:
# this takes approx ~20 mins
# will print output as it goes along to see progress - in JupyterLab right click on the output and select enable scrolling for outputs
# the function deliberately pauses between page requests to avoid overloading the DataCite server
dataCite_df = getNERCDataDOIs()

Total records: 3603
Total pages: 145
Status:  200
Page:  1
Status:  200
Page:  2
Status:  200
Page:  3
Status:  200
Page:  4
Status:  200
Page:  5
Status:  200
Page:  6
Status:  200
Page:  7
Status:  200
Page:  8
Status:  200
Page:  9
Status:  200
Page:  10
Status:  200
Page:  11
Status:  200
Page:  12
Status:  200
Page:  13
Status:  200
Page:  14
Status:  200
Page:  15
Status:  200
Page:  16
Status:  200
Page:  17
Status:  200
Page:  18
Status:  200
Page:  19
Status:  200
Page:  20
Status:  200
Page:  21
Status:  200
Page:  22
Status:  200
Page:  23
Status:  200
Page:  24
Status:  200
Page:  25
Status:  200
Page:  26
Status:  200
Page:  27
Status:  200
Page:  28
Status:  200
Page:  29
Status:  200
Page:  30
Status:  200
Page:  31
Status:  200
Page:  32
Status:  200
Page:  33
Status:  200
Page:  34
Status:  200
Page:  35
Status:  200
Page:  36
Status:  200
Page:  37
Status:  200
Page:  38
Status:  200
Page:  39
Status:  200
Page:  40
Status:  200
Page:  41
Status:  200
Page:  42
Status

In [65]:
# to see the whole table, uncomment the next line
# dataCite_df
# look at the 'creators' entry of the first record
dataCite_df['creators'][0]

[{'name': 'GEBCO Bathymetric Compilation Group 2020',
  'affiliation': [],
  'nameIdentifiers': []}]

### Pass the dataset DOIs to the scholex API to get the citations and their respective DOIs


In [66]:
# this takes about 8 mins
# prints the query and the status code for each dataset DOI as it goes along to see progress
# - in JupyterLab right click on the output and select enable scrolling for outputs
scholex_df = getDatasetCitations(dataCite_df)

{'sourcePid': '10.5285/a29c5465-b138-234d-e053-6c86abc040b9'}
Status:  200
{'sourcePid': '10.5285/0f90d926-99ce-43c9-b536-0c7791d1728b'}
Status:  200
{'sourcePid': '10.5285/fa5d606c-dc95-47ee-9016-7a82e446f2f2'}
Status:  200
{'sourcePid': '10.5285/925ac4ec-2a9d-461a-bfaa-6314eb0888c8'}
Status:  200
{'sourcePid': '10.5285/2fd95199-365e-4da1-ae26-3b6d48b3e6ac'}
Status:  200
{'sourcePid': '10.5285/f64815ec-4077-4432-9f55-0ce230f46029'}
Status:  200
{'sourcePid': '10.5285/91523ff9-d621-46b3-87f7-ffb6efcd1847'}
Status:  200
{'sourcePid': '10.5285/6aeba247-52d1-4e84-949f-603742af40c1'}
Status:  200
{'sourcePid': '10.5285/a72a50c6-a829-4e12-9f9a-5a683a1acc4a'}
Status:  200
{'sourcePid': '10.5285/92824e3ec2e44a58b10048df3209b99c'}
Status:  200
{'sourcePid': '10.5285/8cd7e7bb-9a20-05d8-e053-6c86abc012c2'}
Status:  200
{'sourcePid': '10.5285/0313090c-373e-4e2e-97f2-6cd0d4138e75'}
Status:  200
{'sourcePid': '10.5285/a32f775b-34dd-4f31-aafa-f88450eb7a90'}
Status:  200
{'sourcePid': '10.5285/9ab528

In [76]:
# look at the 'pubAuthors' entry of the first row of the table
scholex_df['pubAuthors'][0]

[{'Name': 'Fuchs, Matthias', 'Identifier': []},
 {'Name': 'Palmtag, Juri', 'Identifier': []},
 {'Name': 'Juhls, Bennet', 'Identifier': []},
 {'Name': 'Overduin, Pier Paul', 'Identifier': []},
 {'Name': 'Grosse, Guido', 'Identifier': []},
 {'Name': 'Abdelwahab, Ahmed', 'Identifier': []},
 {'Name': 'Bedington, Michael', 'Identifier': []},
 {'Name': 'Sanders, Tina', 'Identifier': []},
 {'Name': 'Ogneva, Olga', 'Identifier': []},
 {'Name': 'Fedorova, Irina V', 'Identifier': []},
 {'Name': 'Zimov, Nikita S', 'Identifier': []},
 {'Name': 'Mann, Paul James', 'Identifier': []},
 {'Name': 'Strauss, Jens', 'Identifier': []}]

### Add the citation count collected from Scholex for each dataset to the DataCite dataframe

In [77]:
# build a {dataset DOI: citation count} lookup from scholex_df
d = scholex_df.set_index('datasetDOI')['citations'].to_dict()

# look up every DOI of dataCite_df in that dictionary; DOIs without an entry come
# back as NaN, so convert those to 0 and then turn the floats into integers
dataCite_df['citations'] = (
    dataCite_df['datasetDOI_attribute']
    .map(d)
    .fillna(0)
    .astype('int')
)

#### Process dataframes and tidy up some columns

In [78]:
# Process publisher names to result in one consistent name for each data centre.
# The rules are checked from top to bottom and the first rule with a matching keyword
# wins, so the order matters (e.g. 'environmental information' is checked before
# 'environmental data').
# This may need reviewing periodically as it may not catch every variation of data centre names.
_PUBLISHER_RULES = [
    (('polar',), 'Polar Data Centre (PDC)'),
    (('atmospheric', 'badc', 'earth'), 'Centre for Environmental Data Analysis (CEDA)'),
    (('oceanographic',), 'British Oceanographic Data Centre (BODC)'),
    (('geological', 'geoscience'), 'National Geoscience Data Centre (NGDC)'),
    (('environmental information',), 'Environmental Information Data Centre (EIDC)'),
    (('environmental data',), 'Centre for Environmental Data Analysis (CEDA)'),
]


def _normalisePublisherName(dataCentreName):
    """Map one raw publisher name onto the consistent name of its data centre.

    Missing values (NaN floats or None) and names that match no rule are returned
    unchanged.
    """
    # isinstance also catches float subclasses such as numpy.float64
    if isinstance(dataCentreName, float) or dataCentreName is None:
        return dataCentreName

    dataCentreName_lower = dataCentreName.lower() # make it all lowercase as the 'in' operator is case sensitive
    for keywords, consistentName in _PUBLISHER_RULES:
        if any(keyword in dataCentreName_lower for keyword in keywords):
            return consistentName
    return dataCentreName


# dataCite_df - add the processed publisher names as a new column
dataCite_df['publisher_processed'] = [_normalisePublisherName(name) for name in dataCite_df['publisher']]

# scholex_df - add the processed publisher names as a new column
scholex_df['publisher_processed'] = [_normalisePublisherName(name) for name in scholex_df['datasetPublisher']]

In [86]:
# dataCite_df - put the columns in their final order (the raw 'publisher' column is
# left out of the selection) and give the processed publisher column its name
dataCite_df = (
    dataCite_df[['publisher_processed', 'title', 'citations', 'creators',
                 'datasetDOI_attribute', 'dates', 'page_number', 'Page endpoint']]
    .rename(columns={'publisher_processed': 'publisher'})
)

# scholex_df - swap the raw publisher column for the processed one
scholex_df = (
    scholex_df
    .drop(columns=['datasetPublisher'])
    .rename(columns={'publisher_processed': 'datasetPublisher'})
)

### Check the DOIs at DOI.org to determine the type of publication and to check there are no duplicates (by preprints etc) 

In [11]:
# take DOI of each publication that cites datasets and check what type of publication it is
def checkDOIpubType(publicationDOI):
    """Ask doi.org what type of publication a DOI refers to.

    Parameters
    ----------
    publicationDOI : str
        DOI of the publication. Anything that cannot be turned into a doi.org URL
        (e.g. a list of non-DOI identifiers) is reported as unknown.

    Returns
    -------
    tuple of (str, str)
        ``(pubType, pubSubType)``. ``pubSubType`` is the string 'None' when the
        returned metadata has no 'subtype' entry. Both are 'unknown' when the
        request fails or the response is not JSON with a 'type' entry, so that
        these records can be dealt with later.
    """
    try:
        doi_url = 'https://doi.org/' + publicationDOI
        r = requests.get(doi_url, headers={"Accept": "application/json"}) # sometimes throws up unexpected errors
        print(r.status_code, publicationDOI)

        # sometimes the API call returns info in a format that is not JSON, or
        # JSON without publication type info - both end up in the except branch
        metadata = r.json()
        pubType = metadata['type']
        pubSubType = metadata['subtype'] if 'subtype' in metadata else 'None'
    except Exception: # deliberately broad (best effort), but lets KeyboardInterrupt through
        print('no json: ', publicationDOI)
        return 'unknown', 'unknown'

    print(pubType, ': ', publicationDOI)
    return pubType, pubSubType

In [12]:
# takes > 60 mins
# pass publication DOIs to DOI.org to determine type of publication using checkDOIpubType function defined above
pubTypeList = [] # one (pubType, pubSubType) tuple per row of scholex_df, in the same order

# count starts at 1 so that the long pause happens after every 200th request
# (and not straight after the very first one)
for count, doi in enumerate(scholex_df['pubID'], start=1):
    pubTypeList.append(checkDOIpubType(doi))

    # add code to catch retries limit exceeded - might need to be in function itself?
    # e.g. https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests

    time.sleep(1)
    if count % 200 == 0: # if count is a multiple of 200 wait for a bit
        time.sleep(180)

print('Done!')

scholex_df['publicationType'] = pubTypeList

200 10.5285/b809a040-8305-4bc5-baff-76aa2b823734
no json:  10.5285/b809a040-8305-4bc5-baff-76aa2b823734
200 10.1038/s41598-018-29286-w
journal-article :  10.1038/s41598-018-29286-w
200 10.1016/j.ecolind.2016.01.060
journal-article :  10.1016/j.ecolind.2016.01.060
200 10.1007/s10531-020-01938-2
journal-article :  10.1007/s10531-020-01938-2
200 10.1111/jfr3.12464
journal-article :  10.1111/jfr3.12464
200 10.1016/j.agee.2019.106619
journal-article :  10.1016/j.agee.2019.106619
200 10.5194/bg-15-1497-2018
journal-article :  10.5194/bg-15-1497-2018
200 10.1080/21606544.2019.1611481
journal-article :  10.1080/21606544.2019.1611481
200 10.3390/su13031456
journal-article :  10.3390/su13031456
200 10.1016/j.scitotenv.2019.02.228
journal-article :  10.1016/j.scitotenv.2019.02.228
200 10.1007/s10980-019-00822-x
journal-article :  10.1007/s10980-019-00822-x
200 10.1002/pan3.10042
journal-article :  10.1002/pan3.10042
200 10.1002/ece3.5613
journal-article :  10.1002/ece3.5613
200 10.1016/j.scitoten

In [13]:
# split publicationType column into publicationType1 and publicationSubType columns
# (each entry of publicationType is the (pubType, pubSubType) pair returned by checkDOIpubType;
# re-using scholex_df.index keeps the new columns aligned with the existing rows)
scholex_df[['publicationType1','publicationSubType']] = pd.DataFrame(scholex_df['publicationType'].tolist(), index=scholex_df.index)

In [14]:
# determine publication type for records where DOI.org API call failed - ~10 mins
newPubTypeList = []

# go through the pubtype and pubDOI of every row; exactly one value is appended per
# row so that the list always has the same length as scholex_df
for pubType, pubDOI in zip(scholex_df['publicationType1'],scholex_df['pubID']):

    newPubType = pubType # keep the existing type unless a lookup below replaces it

    # only look up 'unknown' rows that hold a single DOI - in the cases where there was
    # no DOI there is more than one ID recorded (a list, or a string of 100 (arbitrarily)
    # or more characters), and those records are skipped
    if pubType == 'unknown' and isinstance(pubDOI, str) and len(pubDOI) < 100:

        # determine if crossref or datacite supplies the DOI
        print('Pub DOI: ', pubDOI)
        r = requests.get(('https://doi.org/doiRA/' + pubDOI), headers={"Accept": "application/json"})
        DOIregistry = r.json()[0].get('RA') # None when the reply names no registration agency
        print(DOIregistry)

        # query the crossref or datacite API
        if DOIregistry == 'DataCite':
            # ask the Datacite API what type of publication
            r = requests.get(('https://api.datacite.org/dois/' + pubDOI), headers = {'client-id': 'bl.nerc'})
            print(r.status_code)
            newPubType = r.json()['data']['attributes']['types']['citeproc'] # is citeproc the correct one to use?

        elif DOIregistry == 'Crossref':
            r = requests.get(('https://api.crossref.org/works/'  + pubDOI), headers={"Accept": "application/json"})
            print(r.status_code)
            newPubType = r.json()['message']['type'] # could also add 'subtype':r.json()['message']['subtype']
        else:
            print('Unknown DOI registry') # the type stays 'unknown'

    newPubTypeList.append(newPubType)

scholex_df['newPubTypeList'] = newPubTypeList
print("Done")

Pub DOI:  10.5285/b809a040-8305-4bc5-baff-76aa2b823734
DataCite
200
Pub DOI:  10.5285/9c972cfb-0ffa-4144-a943-da6eb82431d2
DataCite
200
Pub DOI:  10.5285/d9a74ea7-2a1a-4068-847e-5bc9f51947c5
DataCite
200
Pub DOI:  10.5285/1467b446-54eb-45c1-8a31-f4af21e60e60
DataCite
200
Pub DOI:  10.5281/zenodo.3268427
DataCite
200
Pub DOI:  10.5281/zenodo.3746110
DataCite
200
Pub DOI:  10.5285/2059acfe-bdf6-48dc-a30f-be2ef7555b37
DataCite
200
Pub DOI:  10.15468/dl.wcnfng
DataCite
200
Pub DOI:  10.15468/dl.p53eb8
DataCite
200
Pub DOI:  10.15468/dl.c3q7va
DataCite
200
Pub DOI:  10.15468/dl.nha85n
DataCite
200
Pub DOI:  10.15468/dl.jjvbbv
DataCite
200
Pub DOI:  10.15468/dl.upchfd
DataCite
200
Pub DOI:  10.15468/dl.ndqsnj
DataCite
200
Pub DOI:  10.15468/dl.3x28gc
DataCite
200
Pub DOI:  10.15468/dl.kx9xd9
DataCite
200
Pub DOI:  10.15468/dl.zx8ebd
DataCite
200
Pub DOI:  10.15468/dl.y8apfr
DataCite
200
Pub DOI:  10.15468/dl.mmy4mz
DataCite
200
Pub DOI:  10.15468/dl.yxmw3x
DataCite
200
Pub DOI:  10.15468/dl.

In [15]:
# tidy up scholex_df: remove the intermediate publication type columns that are no
# longer necessary and keep only newPubTypeList, under the name PubType
scholex_df = (
    scholex_df
    .drop(columns=['publicationType', 'publicationType1', 'publicationSubType'])
    .rename(columns={"newPubTypeList": "PubType"})
)
scholex_df

Unnamed: 0,citations,relationshipType,pubTitle,pubDate,pubID,datasetDOI,datasetTitle,datasetPublisher,PubType
0,1,IsReferencedBy,"Microseismic icequake catalogue, Rutford Ice S...",2020-11-13,10.5285/b809a040-8305-4bc5-baff-76aa2b823734,10.5285/54757cbe-0b13-4385-8b31-4dfaa1dab55e,Rutford Ice Stream bed elevation DEM from rada...,Polar Data Centre (PDC),dataset
1,1,IsReferencedBy,Contiguous US summer maximum temperature and h...,2018-07-24,10.1038/s41598-018-29286-w,10.5285/c311c7948e8a47b299f8f9c7ae6cb9af,CRU TS3.25: Climatic Research Unit (CRU) Time-...,Centre for Environmental Data Analysis (CEDA),journal-article
2,1,IsReferencedBy,Providing information on environmental change:...,2016-09-01,10.1016/j.ecolind.2016.01.060,10.5285/e1c292b0-12c7-4998-b48f-7a83a203e604,UK Environmental Change Network (ECN) spittle ...,Environmental Information Data Centre (EIDC),journal-article
3,11,IsReferencedBy,"An empirical, cross-taxon evaluation of landsc...",2020-03-01,10.1007/s10531-020-01938-2,10.5285/bb15e200-9349-403c-bda9-b430093807c7,"Land Cover Map 2015 (25m raster, GB)",Environmental Information Data Centre (EIDC),journal-article
4,11,IsReferencedBy,Design flood estimation and utility of high‐re...,2019-06-01,10.1111/jfr3.12464,10.5285/bb15e200-9349-403c-bda9-b430093807c7,"Land Cover Map 2015 (25m raster, GB)",Environmental Information Data Centre (EIDC),journal-article
...,...,...,...,...,...,...,...,...,...
1019,1,IsReferencedBy,Borehole thermometry data from Rutford Ice Str...,2020-06-26,10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,Long-duration GPS record from Rutford Ice Stre...,Polar Data Centre (PDC),dataset
1020,1,IsReferencedBy,Long-duration GPS record from Rutford Ice Stre...,2020-05-05,10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,Borehole thermometry data from Rutford Ice Str...,Polar Data Centre (PDC),dataset
1021,1,IsReferencedBy,Distribution of mesopelagic fish in the Scotia...,2021-03-09,10.5285/f4dfc0ee-4f61-47c5-a5a8-238e02ff2fdd,10.5285/8e59f849-5b93-438e-a5e0-3c65636f9053,Occurrence records of ten Southern Ocean mycto...,Polar Data Centre (PDC),dataset
1022,1,IsReferencedBy,The Birmingham Urban Climate Laboratory-A high...,2016-06-07,10.1038/sdata.2016.38,10.5285/48316483-edfe-4009-9898-da41bf3023bd,HiTemp: High Density Temperature measurements ...,Centre for Environmental Data Analysis (CEDA),journal-article


### Get the citation string (APA format) of the publication that has cited the dataset

In [16]:
# TAKES A LONG TIME - ~90-120 MINS - can skip this if you don't want this info
citationStrList = [] # one citation string per row of scholex_df, in the same order

for pubDOI in scholex_df['pubID']:
    citationStr = '' # stays empty when no citation string can be retrieved

    if isinstance(pubDOI, str):
        # the citation style is a parameter of the Accept header, not a header of its own
        r = requests.get(('https://doi.org/' + pubDOI), headers={"Accept": "text/x-bibliography; style=apa"})
        #print(r.status_code)
        if r.status_code == 200:
            # requests falls back to ISO-8859-1 for text responses that declare no charset
            # NOTE(review): assumes the formatted citation is sent as UTF-8 - confirm
            r.encoding = 'utf-8'
            citationStr = r.text
        else:
            # do not store an error page as if it were a citation
            print('No citation string: ', r.status_code, pubDOI)
    else:
        # records without a DOI hold a list of other identifiers instead of a string
        print('No citation string (no single DOI): ', pubDOI)

    citationStrList.append(citationStr) # add the citation string to the list

scholex_df['PubCitationStr'] = citationStrList # add the citation string list to the Scholex df

### Output spreadsheets

In [111]:
from datetime import date

# Stamp both output files with today's date in DDMMYYYY form
today = date.today()
date_stamp = today.strftime("%d%m%Y")

# Publication-level citation info (one row per dataset/publication link)
scholex_filename = f'dataset_citation_publication_info_{date_stamp}.csv'
scholex_df.to_csv(scholex_filename, index=False)
print(scholex_filename)

# Dataset-level citation counts
dataset_filename = f'dataset_citation_counts_{date_stamp}.csv'
dataCite_df.to_csv(dataset_filename, index=False)
print(dataset_filename)

dataset_citation_publication_info_23112022.csv
dataset_citation_counts_23112022.csv


### Code to inspect different parts of the tables

In [5]:
scholex_df.head(10)  # preview the first 10 rows of the Scholex dataframe

Unnamed: 0,citations,relationshipType,pubTitle,pubDate,pubAuthors,pubID,datasetDOI,datasetTitle,datasetAuthors,datasetPublisher
0,1,IsReferencedBy,High-resolution bathymetry model for the Kolym...,01/01/2021,"[{'Name': 'Fuchs, Matthias', 'Identifier': []}...",10.1594/pangaea.934049,10.5285/c10a2798-40cc-7648-e053-6c86abc07c3c,Kolyma river and near shore CTD measurements t...,"[{'name': 'Palmtag, Juri', 'nameType': 'Person...",British Oceanographic Data Centre (BODC)
1,1,IsReferencedBy,Providing information on environmental change:...,01/09/2016,"[{'Name': 'S. Rennie', 'Identifier': []}]",10.1016/j.ecolind.2016.01.060,10.5285/ca2b1766-2acc-4eeb-82d1-29970e9a5667,UK Environmental Change Network (ECN) frog dat...,"[{'name': 'Rennie, S.', 'nameType': 'Personal'...",Environmental Information Data Centre (EIDC)
2,4,IsReferencedBy,Unprecedented Fe delivery from the Congo River...,28/01/2020,[],10.1038/s41467-019-14255-2,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC)
3,4,IsReferencedBy,Swiss-Polar-Institute/science-data-utils v0.4.0,09/04/2020,"[{'Name': 'Thomas, Jenny', 'Identifier': []}, ...",10.5281/zenodo.3268427,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC)
4,4,IsReferencedBy,Advancing global storm surge modelling using t...,06/01/2020,"[{'Name': 'Dullaart, Job C. M.', 'Identifier':...",10.1007/s00382-019-05044-0,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC)
5,4,IsReferencedBy,Swiss-Polar-Institute/science-data-utils v0.4.0,09/04/2020,"[{'Name': 'Thomas, Jenny', 'Identifier': []}, ...",10.5281/zenodo.3746110,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC)
6,9,IsReferencedBy,Quantification of Information Exchange in Idea...,08/11/2019,[],10.3390/e21111094,10.5285/58a8802721c94c66ae45c3baa4d814d0,CRU TS4.01: Climatic Research Unit (CRU) Time-...,[{'name': 'University Of East Anglia Climatic ...,Centre for Environmental Data Analysis (CEDA)
7,9,IsReferencedBy,The response of vegetation to rising CO 2 conc...,18/04/2018,"[{'Name': 'Hong, Tao', 'Identifier': []}, {'Na...",10.1007/s00704-018-2476-7,10.5285/58a8802721c94c66ae45c3baa4d814d0,CRU TS4.01: Climatic Research Unit (CRU) Time-...,[{'name': 'University Of East Anglia Climatic ...,Centre for Environmental Data Analysis (CEDA)
8,9,IsReferencedBy,Variation in Climate Signals in Teak Tree-Ring...,14/12/2018,"[{'Name': 'Preechamart, Sineenart', 'Identifie...",10.3390/f9120772,10.5285/58a8802721c94c66ae45c3baa4d814d0,CRU TS4.01: Climatic Research Unit (CRU) Time-...,[{'name': 'University Of East Anglia Climatic ...,Centre for Environmental Data Analysis (CEDA)
9,9,IsReferencedBy,Distinguishing interannual variations and poss...,31/05/2019,"[{'Name': 'Xue, Xu', 'Identifier': []}, {'Name...",10.1007/s00382-019-04837-7,10.5285/58a8802721c94c66ae45c3baa4d814d0,CRU TS4.01: Climatic Research Unit (CRU) Time-...,[{'name': 'University Of East Anglia Climatic ...,Centre for Environmental Data Analysis (CEDA)


In [19]:
dataCite_df.head(10)  # preview the first 10 rows of the DataCite dataframe

Unnamed: 0,publisher,title,citations,datasetDOI_attribute,dates,page_number,Page endpoint
0,National Geoscience Data Centre (NGDC),"Cardiff Urban Geo-Observatory, Groundwater Tem...",0,10.5285/bf150dd6-7b28-49ca-b66f-8b543a33a5c0,"[{'date': '2019-09-21', 'dateType': 'Available...",1,https://api.datacite.org/dois?page=1&client-id...
1,Environmental Information Data Centre (EIDC),Land Classification of Cumbria 1975,0,10.5285/0ac6249c-a6f2-4147-8ae9-50d576e85fc5,"[{'date': '2015-12-03', 'dateType': 'Submitted...",1,https://api.datacite.org/dois?page=1&client-id...
2,Environmental Information Data Centre (EIDC),CEH Land Cover plus: Fertilisers 2010-2015 (En...,0,10.5285/15f415db-e87b-4ab5-a2fb-37a78e7bf051,"[{'date': '2019-02-04', 'dateType': 'Submitted...",1,https://api.datacite.org/dois?page=1&client-id...
3,Centre for Environmental Data Analysis (CEDA),CHESS-SCAPE: Future projections of meteorologi...,0,10.5285/8194b416cbee482b89e0dfbe17c5786c,"[{'date': '2022-03-28', 'dateType': 'Updated'}...",1,https://api.datacite.org/dois?page=1&client-id...
4,British Oceanographic Data Centre (BODC),The GEOTRACES Intermediate Data Product 2021 (...,0,10.5285/cf2d9ba9-d51d-3b7c-e053-8486abc0f5fd,"[{'date': '2005-08-08/2019-06-13', 'dateType':...",1,https://api.datacite.org/dois?page=1&client-id...
5,Environmental Information Data Centre (EIDC),Flower-insect timed count data from the UK Pol...,0,10.5285/61b7df6e-4e27-460a-84a5-c100f0dc919f,"[{'date': '2022-08-19', 'dateType': 'Submitted...",1,https://api.datacite.org/dois?page=1&client-id...
6,Environmental Information Data Centre (EIDC),Flower-insect timed count data from the UK Pol...,0,10.5285/13aed7ac-334f-4bb7-b476-4f1c3da45a13,"[{'date': '2022-11-01', 'dateType': 'Submitted...",1,https://api.datacite.org/dois?page=1&client-id...
7,Environmental Information Data Centre (EIDC),Pan-trap survey data from the UK Pollinator Mo...,0,10.5285/06cc6b8f-9bd4-4ae4-af5d-65bcbd319e9f,"[{'date': '2022-08-19', 'dateType': 'Submitted...",1,https://api.datacite.org/dois?page=1&client-id...
8,British Oceanographic Data Centre (BODC),Multi-generational responses of an elevated pC...,0,10.5285/22b54764-2448-1318-e053-6c86abc01ae1,"[{'date': '2012-12-01/2013-06-30', 'dateType':...",1,https://api.datacite.org/dois?page=1&client-id...
9,Centre for Environmental Data Analysis (CEDA),ESA Sea Ice Climate Change Initiative (Sea_Ice...,0,10.5285/ff79d140824f42dd92b204b4f1e9e7c2,"[{'date': '2018-05-25', 'dateType': 'Updated'}...",1,https://api.datacite.org/dois?page=1&client-id...


In [23]:
scholex_df.loc[[4], :]  # show a single row (by index label) as a one-row dataframe

Unnamed: 0,citations,relationshipType,pubTitle,pubDate,pubID,datasetDOI,datasetTitle,datasetPublisher,PubType
4,2,IsReferencedBy,Acoustic backscatter data and RMT25 abundance ...,2019-04-17,10.5285/e15d622c-5c7e-45e9-b127-f27def94bbe8,10.5285/fad1e73b-b7cb-4728-a8f9-2e3cce5aef90,Water column acoustic data collected from 2016...,Polar Data Centre (PDC),dataset


In [24]:
dataCite_df.loc[[5], :]  # show a single row (by index label) as a one-row dataframe

Unnamed: 0,publisher,title,citations,datasetDOI_attribute,dates,page_number,Page endpoint
5,Centre for Environmental Data Analysis (CEDA),HadUK-Grid Gridded Climate Observations on a 6...,0,10.5285/6f4ac352b19341eb8c5b26644845ac35,"[{'date': '2022-05-26', 'dateType': 'Updated'}...",1,https://api.datacite.org/dois?page=1&client-id...


In [None]:
# Rank the datasets from most to least cited
dataCite_df.sort_values('citations', ascending=False)

In [None]:
# Keep only the datasets with more than 10 citations
dataCite_df.loc[dataCite_df['citations'].gt(10)]

### Code to process the author names

In [106]:
# Peek at the name of the first author record of the first row
scholex_df.loc[0, 'datasetAuthors'][0]['name']

'Palmtag, Juri'

In [107]:
# For every row, reduce the list of dataset author records (dicts) to a plain
# list holding just each record's 'name' value.
datasetAuthors_processed = [
    [authorRecord['name'] for authorRecord in authorRecords]
    for authorRecords in scholex_df['datasetAuthors']
]

In [109]:
# Store the per-row lists of dataset author names as a new column of the Scholex df
scholex_df['datasetAuthors_processed'] = datasetAuthors_processed

In [98]:
# Peek at the first author record of the first row's publication
scholex_df.loc[0, 'pubAuthors'][0]

{'Name': 'Fuchs, Matthias', 'Identifier': []}

In [99]:
# For every row, reduce the list of publication author records (dicts) to a
# plain list holding just each record's 'Name' value (capitalised key, unlike
# the lower-case 'name' key used in the dataset author records).
pubAuthors_processed = [
    [authorRecord['Name'] for authorRecord in authorRecords]
    for authorRecords in scholex_df['pubAuthors']
]

In [101]:
# Store the per-row lists of publication author names as a new column of the Scholex df
scholex_df['pubAuthors_processed'] = pubAuthors_processed

In [102]:
scholex_df # display the dataframe to check the new pubAuthors_processed column

Unnamed: 0,citations,relationshipType,pubTitle,pubDate,pubAuthors,pubID,datasetDOI,datasetTitle,datasetAuthors,datasetPublisher,pubAuthors_processed
0,1,IsReferencedBy,High-resolution bathymetry model for the Kolym...,2021-01-01,"[{'Name': 'Fuchs, Matthias', 'Identifier': []}...",10.1594/pangaea.934049,10.5285/c10a2798-40cc-7648-e053-6c86abc07c3c,Kolyma river and near shore CTD measurements t...,"[{'name': 'Palmtag, Juri', 'nameType': 'Person...",British Oceanographic Data Centre (BODC),"[Fuchs, Matthias, Palmtag, Juri, Juhls, Bennet..."
1,1,IsReferencedBy,Providing information on environmental change:...,2016-09-01,"[{'Name': 'S. Rennie', 'Identifier': []}]",10.1016/j.ecolind.2016.01.060,10.5285/ca2b1766-2acc-4eeb-82d1-29970e9a5667,UK Environmental Change Network (ECN) frog dat...,"[{'name': 'Rennie, S.', 'nameType': 'Personal'...",Environmental Information Data Centre (EIDC),[S. Rennie]
2,4,IsReferencedBy,Unprecedented Fe delivery from the Congo River...,2020-01-28,[],10.1038/s41467-019-14255-2,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),[]
3,4,IsReferencedBy,Swiss-Polar-Institute/science-data-utils v0.4.0,2020-04-09,"[{'Name': 'Thomas, Jenny', 'Identifier': []}, ...",10.5281/zenodo.3268427,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),"[Thomas, Jenny, Pina Estany, Carles]"
4,4,IsReferencedBy,Advancing global storm surge modelling using t...,2020-01-06,"[{'Name': 'Dullaart, Job C. M.', 'Identifier':...",10.1007/s00382-019-05044-0,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),"[Dullaart, Job C. M., Muis, Sanne, Bloemendaal..."
...,...,...,...,...,...,...,...,...,...,...,...
1012,1,IsReferencedBy,Borehole thermometry data from Rutford Ice Str...,2020-06-26,"[{'Name': 'Smith, Andrew', 'Identifier': []}, ...",10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,Long-duration GPS record from Rutford Ice Stre...,"[{'name': 'Smith, Andrew', 'givenName': 'Andre...",Polar Data Centre (PDC),"[Smith, Andrew, Makinson, Keith, Nicholls, Keith]"
1013,1,IsReferencedBy,Long-duration GPS record from Rutford Ice Stre...,2020-05-05,"[{'Name': 'Smith, Andrew', 'Identifier': []}, ...",10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,Borehole thermometry data from Rutford Ice Str...,"[{'name': 'Smith, Andrew', 'givenName': 'Andre...",Polar Data Centre (PDC),"[Smith, Andrew, Murray, Tavi, King, Matt]"
1014,1,IsReferencedBy,Distribution of mesopelagic fish in the Scotia...,2021-03-09,"[{'Name': 'Collins, Martin', 'Identifier': []}...",10.5285/f4dfc0ee-4f61-47c5-a5a8-238e02ff2fdd,10.5285/8e59f849-5b93-438e-a5e0-3c65636f9053,Occurrence records of ten Southern Ocean mycto...,"[{'name': 'Freer, Jennifer', 'givenName': 'Jen...",Polar Data Centre (PDC),"[Collins, Martin, Piatkowski, Uwe, Saunders, R..."
1015,1,IsReferencedBy,The Birmingham Urban Climate Laboratory-A high...,2016-06-07,[],10.1038/sdata.2016.38,10.5285/48316483-edfe-4009-9898-da41bf3023bd,HiTemp: High Density Temperature measurements ...,"[{'name': 'Warren, E. L.', 'nameType': 'Person...",Centre for Environmental Data Analysis (CEDA),[]


In [110]:
scholex_df # display the dataframe to check the new datasetAuthors_processed column

Unnamed: 0,citations,relationshipType,pubTitle,pubDate,pubAuthors,pubID,datasetDOI,datasetTitle,datasetAuthors,datasetPublisher,pubAuthors_processed,datasetAuthors_processed
0,1,IsReferencedBy,High-resolution bathymetry model for the Kolym...,2021-01-01,"[{'Name': 'Fuchs, Matthias', 'Identifier': []}...",10.1594/pangaea.934049,10.5285/c10a2798-40cc-7648-e053-6c86abc07c3c,Kolyma river and near shore CTD measurements t...,"[{'name': 'Palmtag, Juri', 'nameType': 'Person...",British Oceanographic Data Centre (BODC),"[Fuchs, Matthias, Palmtag, Juri, Juhls, Bennet...","[Palmtag, Juri, Mann, Paul J]"
1,1,IsReferencedBy,Providing information on environmental change:...,2016-09-01,"[{'Name': 'S. Rennie', 'Identifier': []}]",10.1016/j.ecolind.2016.01.060,10.5285/ca2b1766-2acc-4eeb-82d1-29970e9a5667,UK Environmental Change Network (ECN) frog dat...,"[{'name': 'Rennie, S.', 'nameType': 'Personal'...",Environmental Information Data Centre (EIDC),[S. Rennie],"[Rennie, S., Adamson, J., Anderson, R., Andrew..."
2,4,IsReferencedBy,Unprecedented Fe delivery from the Congo River...,2020-01-28,[],10.1038/s41467-019-14255-2,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),[],[GEBCO Bathymetric Compilation Group 2019]
3,4,IsReferencedBy,Swiss-Polar-Institute/science-data-utils v0.4.0,2020-04-09,"[{'Name': 'Thomas, Jenny', 'Identifier': []}, ...",10.5281/zenodo.3268427,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),"[Thomas, Jenny, Pina Estany, Carles]",[GEBCO Bathymetric Compilation Group 2019]
4,4,IsReferencedBy,Advancing global storm surge modelling using t...,2020-01-06,"[{'Name': 'Dullaart, Job C. M.', 'Identifier':...",10.1007/s00382-019-05044-0,10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,[{'name': 'GEBCO Bathymetric Compilation Group...,British Oceanographic Data Centre (BODC),"[Dullaart, Job C. M., Muis, Sanne, Bloemendaal...",[GEBCO Bathymetric Compilation Group 2019]
...,...,...,...,...,...,...,...,...,...,...,...,...
1012,1,IsReferencedBy,Borehole thermometry data from Rutford Ice Str...,2020-06-26,"[{'Name': 'Smith, Andrew', 'Identifier': []}, ...",10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,Long-duration GPS record from Rutford Ice Stre...,"[{'name': 'Smith, Andrew', 'givenName': 'Andre...",Polar Data Centre (PDC),"[Smith, Andrew, Makinson, Keith, Nicholls, Keith]","[Smith, Andrew, Murray, Tavi, King, Matt]"
1013,1,IsReferencedBy,Long-duration GPS record from Rutford Ice Stre...,2020-05-05,"[{'Name': 'Smith, Andrew', 'Identifier': []}, ...",10.5285/dac20505-a56e-4beb-97ba-077eecd587c0,10.5285/1fd6ab5a-3139-4a12-99d3-2ba0c0213744,Borehole thermometry data from Rutford Ice Str...,"[{'name': 'Smith, Andrew', 'givenName': 'Andre...",Polar Data Centre (PDC),"[Smith, Andrew, Murray, Tavi, King, Matt]","[Smith, Andrew, Makinson, Keith, Nicholls, Keith]"
1014,1,IsReferencedBy,Distribution of mesopelagic fish in the Scotia...,2021-03-09,"[{'Name': 'Collins, Martin', 'Identifier': []}...",10.5285/f4dfc0ee-4f61-47c5-a5a8-238e02ff2fdd,10.5285/8e59f849-5b93-438e-a5e0-3c65636f9053,Occurrence records of ten Southern Ocean mycto...,"[{'name': 'Freer, Jennifer', 'givenName': 'Jen...",Polar Data Centre (PDC),"[Collins, Martin, Piatkowski, Uwe, Saunders, R...","[Freer, Jennifer]"
1015,1,IsReferencedBy,The Birmingham Urban Climate Laboratory-A high...,2016-06-07,[],10.1038/sdata.2016.38,10.5285/48316483-edfe-4009-9898-da41bf3023bd,HiTemp: High Density Temperature measurements ...,"[{'name': 'Warren, E. L.', 'nameType': 'Person...",Centre for Environmental Data Analysis (CEDA),[],"[Warren, E. L., Chapman, L., Muller, C. T., Yo..."
