### NERC dataset citations - part 1
Code to collect NERC dataset citations from the Scholix, CrossRef and DataCite APIs.
Produces one CSV file for each database.

In [1]:
import requests, time, json, re, datetime, os, sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import numpy as np
import pandas as pd
from math import ceil
from datetime import date
from scholix_fun import getNERCDataDOIs, getScholixDatasetCitations, process_citation_results, getPublicationType, countScholixCitations, getCitationString
import crossRef_fun
from crossRef_fun import getDataCiteInfo, getCrossRefCitations, filterCrossRefResults, mergeDFs, getPublicationInfo
from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy
# import exportCitationResultsToCsv
# from Results import convertCSVtoJSON


# Put the parent directory at the front of the module search path.
# NOTE(review): this runs after the project imports above, so it can only
# affect imports made in later cells (e.g. `from functions import ...`);
# presumably the `functions` package lives in the parent directory - confirm.
sys.path.insert(0, '..')


### Get the dataset DOIs

In [2]:
# Collect the dataset DOIs. This is slow: approximately 20 minutes.
# NOTE(review): `dataCite_df_new` is not read by any later cell in this
# notebook (the Scholix cell uses `dataCite_df` instead) - confirm which
# frame is intended.
from functions import getNERCDataDOIs_new
dataCite_df_new = getNERCDataDOIs_new.getNERCDataDOIs()

KeyboardInterrupt: 

## DataCite

In [None]:
# Relation types to request; the list is handed unchanged to the project helper.
# Later cells read these columns from the result: 'subj-id', 'obj-id',
# 'source-id', 'relation-type-id', 'occurred-at' and 'Page endpoint'.
# presumably one row per DataCite citation event - verify against the helper.
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to']
dataCite_df_relationTypes = getDataCiteCitations_relationTypes.getDataCiteCitations_relationTypes(relation_type_id_list)

In [None]:
# Derive the bare dataset DOI from each `subj-id` URL by removing the
# 'https://doi.org/' resolver prefix, and store it in a new `data_doi` column.
dataCite_df_relationTypes['data_doi'] = [
    subj_url.replace('https://doi.org/', '')
    for subj_url in dataCite_df_relationTypes['subj-id']
]

# `subj-id` is redundant once `data_doi` exists, so it is dropped.
# `obj-id` (the citing publication) keeps its full URL form and is only
# renamed to `pub_doi_url`; no bare publication DOI is derived here.
dataCite_df_relationTypes = (
    dataCite_df_relationTypes
    .drop(columns=['subj-id'])
    .rename(columns={"obj-id": "pub_doi_url"})
)

In [None]:
# Keep only the rows whose dataset DOI is registered under the 10.5285 prefix.
# The trailing '/' is part of the match so that longer registrant prefixes
# (e.g. 10.52851/...) are not mistaken for 10.5285, and na=False treats a
# missing DOI as "no match" instead of producing a NaN mask that cannot be
# used for boolean indexing.
nerc_mask = dataCite_df_relationTypes['data_doi'].str.startswith('10.5285/', na=False)
dataCite_df_relationTypes = dataCite_df_relationTypes[nerc_mask]

In [None]:
# Fetch metadata for every cited dataset from the DataCite REST API and build
# `dataCite_df`: one row per row of `dataCite_df_relationTypes`, combining the
# dataset metadata with the event columns carried over from that frame.
api_url = 'https://api.datacite.org/dois/'

# Sent as URL query parameters. requests.get()'s second positional argument is
# `params` (not `headers`), so passing it by keyword keeps the request the
# same while making it obvious what is actually sent.
query_params = {'client-id': 'bl.nerc'}

columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered',
           'source-id', 'relation-type-id', 'pub_doi_url', 'occurred-at', 'Page endpoint']
event_columns = ['source-id', 'relation-type-id', 'occurred-at', 'Page endpoint', 'data_doi', 'pub_doi_url']

info_list = []
for source_id, relation_type_id, occurred_at, page_endpoint, data_doi, pub_doi_url in zip(
        *(dataCite_df_relationTypes[name] for name in event_columns)):
    # Network errors are not caught here and abort the loop.
    r = requests.get(api_url + data_doi, params=query_params)
    print(r.status_code, data_doi)

    try:
        # Parse the response body once and reuse it for every field.
        attributes = r.json()['data']['attributes']
        info_list.append([
            attributes['publisher'],
            data_doi,
            attributes['titles'][0]['title'],
            [creator['name'] for creator in attributes['creators']],
            attributes['publicationYear'],
            attributes['dates'],
            attributes['registered'],
            source_id, relation_type_id, pub_doi_url, occurred_at, page_endpoint,
        ])
    except (ValueError, LookupError, TypeError) as err:
        # The body was not JSON, or did not have the expected structure
        # (missing key, empty `titles`, ...). Report it and keep going with a
        # placeholder row so one bad record does not end the whole run; only
        # the two DOIs are preserved in that row.
        print("  metadata lookup failed:", data_doi, repr(err))
        info_list.append(["error", data_doi, "error", "error", "error", "error", "error",
                          "error", "error", pub_doi_url, "error", "error"])

dataCite_df = pd.DataFrame(info_list, columns=columns)
print("Done!")
    

In [None]:
# get publication info - takes a long time
# The helper receives a renamed copy of dataCite_df in which `pub_doi_url` is
# called `subj_id` (presumably the column name the helper expects - verify).
dataCite_df_temp = dataCite_df.rename(columns={"pub_doi_url": "subj_id"})
dataCite_df_publication_meta = getPublicationInfo_timeCopy.getPublicationInfo(dataCite_df_temp)

In [None]:
# Add a `pub_doi` column holding each publication DOI without its
# 'https://doi.org/' resolver prefix; the `subj_id` URL column is left as is.
dataCite_df_publication_meta['pub_doi'] = [
    pub_url.replace('https://doi.org/', '')
    for pub_url in dataCite_df_publication_meta['subj_id']
]

In [None]:
# Normalise the free-text dataset publisher names to one canonical label per
# data centre.

# (keywords, canonical label) pairs, checked in order: the first entry with a
# keyword contained in the lower-cased publisher name wins, so order matters
# (e.g. 'environmental information' must be tested before 'environmental data').
_PUBLISHER_RULES = (
    (('polar',), 'Polar Data Centre (PDC)'),
    (('atmospheric', 'badc', 'earth'), 'Centre for Environmental Data Analysis (CEDA)'),
    (('oceanographic',), 'British Oceanographic Data Centre (BODC)'),
    (('geological', 'geoscience'), 'National Geoscience Data Centre (NGDC)'),
    (('environmental information',), 'Environmental Information Data Centre (EIDC)'),
    (('environmental data',), 'Centre for Environmental Data Analysis (CEDA)'),
)


def _normalise_publisher(name):
    """Return the canonical data-centre label for *name*.

    Values that are not strings (NaN of any float type, None, ...) and names
    that match no rule are returned unchanged.
    """
    # isinstance covers every non-string value; an exact `type(x) == float`
    # test would let numpy float NaNs through to .lower() and raise.
    if not isinstance(name, str):
        return name
    lowered = name.lower()  # the `in` test below is case sensitive
    for keywords, canonical in _PUBLISHER_RULES:
        if any(keyword in lowered for keyword in keywords):
            return canonical
    return name


dataCite_df_publication_meta['publisher_processed'] = [
    _normalise_publisher(name)
    for name in dataCite_df_publication_meta['data_publisher']
]

# Swap the raw column for the processed one. Dropping and renaming (rather
# than overwriting in place) leaves `data_publisher` as the last column.
dataCite_df_publication_meta = (
    dataCite_df_publication_meta
    .drop(columns=['data_publisher'])
    .rename(columns={'publisher_processed': 'data_publisher'})
)

## Scholix

### Pass the dataset DOIs to the Scholix API to get the citations and their respective DOIs

In [None]:
# this takes about 8 mins
# NOTE(review): when the cells run top to bottom, `dataCite_df` here is the
# frame built in the DataCite section, not `dataCite_df_new` from the DOI
# collection cell - confirm this is the intended input.
scholex_df = getScholixDatasetCitations.getScholixDatasetCitations(dataCite_df)

#### Process the citation results

In [None]:
# The helper returns a pair of frames, which replace both of its inputs.
dataCite_df, scholex_df = process_citation_results.process_citation_results(dataCite_df, scholex_df)

In [None]:
# Filter out GBIF records (registrant code prefix 10.15468).
# regex=False makes this a literal substring test; interpreted as a regular
# expression the '.' would match any character (e.g. '10915468').
# pubID is passed through str() first so that non-string values such as NaN
# can be tested too.
is_gbif = scholex_df['pubID'].apply(str).str.contains("10.15468", regex=False)
scholex_df = scholex_df[~is_gbif]


### Check the DOIs at DOI.org to determine the type of publication and to check there are no duplicates (by preprints etc) 

In [None]:
# Very slow: 3+ hours. The frame returned by the helper replaces scholex_df.
scholex_df = getPublicationType.getPublicationType(scholex_df)

### Output spreadsheet

In [None]:
# Write the Scholix results twice: a snapshot stamped with today's date
# (ddmmyyyy) and a fixed-name "latest" copy.
today = date.today()
date_stamp = today.strftime("%d%m%Y")
scholex_filename = f"Results/Intermediate data/scholix_citation_publication_info_{date_stamp}.csv"

for csv_path in (scholex_filename, "Results/Intermediate data/latest_results_scholix.csv"):
    scholex_df.to_csv(csv_path, index=False)

## CrossRef

In [None]:
# Request CrossRef results for a date range.

# Arguments handed to the project helper below.
email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "1990-01-01"

# The range ends today, formatted as YYYY-MM-DD.
today = date.today()
end_date = today.isoformat()

# `results_folder_path_name` is not used in this cell; the filtering cell that
# follows passes it to filterCrossRefResults.
# presumably it names the output the helper below writes - verify.
results_folder_path = "Results/"
results_folder_path_name = f"{results_folder_path}NERC_EDS_events_from_{start_date}_up_to_{end_date}"

getCrossRefCitations.getCrossRefCitations_byDates(
    email, prefix, start_date, end_date, results_folder_path
)

In [None]:
# filter results
# The helper is given `results_folder_path_name`, the path stem built in the
# date-range cell (no file extension is appended to it in this notebook).
crossRef_df_gbif_filtered2_deduplicated = filterCrossRefResults.filterCrossRefResults(results_folder_path_name)

In [None]:
# Pass citation info to datacite API to collect relevant info on the datasets, data centres etc
# NOTE(review): this rebinds `dataCite_df`, replacing the frame built in the
# DataCite section; `errors` is not read by any later cell in this notebook -
# confirm both are intended.
(errors, dataCite_df) = getDataCiteInfo.getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

In [None]:
# Combine dataCite_df with the CrossRef results via the project helper.
# The second returned frame replaces the CrossRef frame; `dataset_df` is not
# read by any later cell in this notebook.
(dataset_df, crossRef_df_gbif_filtered2_deduplicated) = mergeDFs.mergeDFs(dataCite_df,crossRef_df_gbif_filtered2_deduplicated)

In [None]:
# The frame returned by the project helper replaces the CrossRef frame.
# presumably this adds metadata about each citing publication - verify
# against getPublicationInfo.
crossRef_df_gbif_filtered2_deduplicated = getPublicationInfo.getPublicationInfo(crossRef_df_gbif_filtered2_deduplicated)

### Output spreadsheet

In [None]:
# Write the CrossRef results twice: a snapshot named after the query window
# and the retrieval date (ddmmyyyy), and a fixed-name "latest" copy.
# `start_date`, `end_date` and `today` are set by the CrossRef date-range cell.
crossRef_df_processed_filename = (
    "Results/Intermediate data/"
    f"dataset_citation_publication_info_{start_date}_to_{end_date}"
    f"_retrieved_{today.strftime('%d%m%Y')}.csv"
)
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index=False)

# to_csv opens and closes the target itself when given a path, so the file is
# not opened separately beforehand (a second, unused handle on the same path
# is redundant and can conflict with the write on some platforms).
crossRef_df_gbif_filtered2_deduplicated.to_csv("Results/Intermediate data/latest_results_crossRef.csv", index=False)

### Output spreadsheet (DataCite)

In [None]:
# Write the DataCite results twice: a snapshot stamped with today's date
# (ddmmyyyy) and a fixed-name "latest" copy.
today = date.today()
retrieved_stamp = today.strftime("%d%m%Y")
dataCite_filename = f"Results/Intermediate data/dataCite_df_events_publication_meta_retrieved_{retrieved_stamp}.csv"

for csv_path in (dataCite_filename, "Results/Intermediate data/latest_results_dataCite.csv"):
    dataCite_df_publication_meta.to_csv(csv_path, index=False)
