### NERC dataset citations
Code to collect NERC dataset citations from Scholix, CrossRef and DataCite APIs, process and merge the results.
Produces a csv and json with details of the citations for NERC published datasets.

In [2]:
import requests, time, json, re, datetime, os, sys
import numpy as np
import pandas as pd
from math import ceil
from datetime import date
from scholix_fun import getNERCDataDOIs, getScholixDatasetCitations, process_citation_results, getPublicationType, countScholixCitations, getCitationString
import crossRef_fun
from crossRef_fun import getDataCiteInfo, getCrossRefCitations, filterCrossRefResults, mergeDFs, getPublicationInfo
from dataCite_fun import getDataCiteCitations_relationTypes, getPublicationInfo_timeCopy
# import exportCitationResultsToCsv
# from Results import convertCSVtoJSON


sys.path.insert(0, '..')


## Scholix

### Get the dataset DOIs

In [None]:
# this takes approx ~20 mins
# will print output as it goes along to see progress - in JupyterLab right click on the output and select enable scrolling for outputs
dataCite_df = getNERCDataDOIs.getNERCDataDOIs()

### Pass the dataset DOIs to the scholex API to get the citations and their respective DOIs

In [None]:
# this takes about 8 mins
# prints output as it goes along to see progress - in JupyterLab right click on the output and select enable scrolling for outputs
scholex_df = getScholixDatasetCitations.getScholixDatasetCitations(dataCite_df)

#### Process the citation results

In [None]:
dataCite_df, scholex_df = process_citation_results.process_citation_results(dataCite_df, scholex_df)

In [None]:
# filter out gbif registrant code prefix 10.15468
# scholex_df = scholex_df[~scholex_df.pubID.str.contains("10.15468")]
scholex_df = scholex_df[~scholex_df['pubID'].apply(lambda x: str(x)).str.contains("10.15468")]


### Check the DOIs at DOI.org to determine the type of publication and to check there are no duplicates (by preprints etc) 

In [None]:
# very long 3+ hours
scholex_df = getPublicationType.getPublicationType(scholex_df)

### Output spreadsheet

In [None]:
today = date.today()

scholex_filename = "Results/Intermediate data/" + 'scholix_citation_publication_info_' + (today.strftime("%d%m%Y")) + '.csv'
scholex_df.to_csv(scholex_filename, index = False)

## CrossRef

In [None]:
# Code for getting results based on a date range

email = "matnic@ceh.ac.uk"
prefix = "10.5285"
start_date = "1990-01-01"
today = date.today()
end_date = today.strftime("%Y-%m-%d")
results_folder_path = "Results/"
results_folder_path_name = results_folder_path + "NERC_EDS_events_from_" + start_date + "_up_to_" + end_date

getCrossRefCitations.getCrossRefCitations_byDates(email, prefix, start_date, end_date, results_folder_path)

In [None]:
# filter results 
crossRef_df_gbif_filtered2_deduplicated = filterCrossRefResults.filterCrossRefResults(results_folder_path_name)

In [None]:
# Pass citation info to datacite API to collect relevant info on the datasets, data centres etc
(errors, dataCite_df) = getDataCiteInfo.getDataCiteInfo(crossRef_df_gbif_filtered2_deduplicated)

In [None]:
(dataset_df, crossRef_df_gbif_filtered2_deduplicated) = mergeDFs.mergeDFs(dataCite_df,crossRef_df_gbif_filtered2_deduplicated)

In [None]:
crossRef_df_gbif_filtered2_deduplicated = getPublicationInfo.getPublicationInfo(crossRef_df_gbif_filtered2_deduplicated)

### Output spreadsheet

In [None]:
crossRef_df_processed_filename = "Results/Intermediate data/" + 'dataset_citation_publication_info_' + start_date + "_to_" + end_date + "_retrieved_" + (today.strftime("%d%m%Y")) + '.csv'
crossRef_df_gbif_filtered2_deduplicated.to_csv(crossRef_df_processed_filename, index = False)

## DataCite

In [None]:
relation_type_id_list = ['is-cited-by', 'is-referenced-by', 'is-supplement-to']
dataCite_df_relationTypes = getDataCiteCitations_relationTypes.getDataCiteCitations_relationTypes(relation_type_id_list)

In [None]:
# remove http from DOI url
doi_list = []
for url in dataCite_df_relationTypes['subj-id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_relationTypes['data_doi'] = doi_list
dataCite_df_relationTypes = dataCite_df_relationTypes.drop(['subj-id'], axis=1)

# doi_list = []
# for url in dataCite_df_relationTypes['obj-id']:
#     doi = url.replace('https://doi.org/','')
#     doi_list.append(doi)
# dataCite_df_relationTypes['pub_doi'] = doi_list # rename to work in getPublicationInfo function 
    
dataCite_df_relationTypes = dataCite_df_relationTypes.rename(columns={"obj-id": "pub_doi_url"})

In [None]:
# drop the rows where the data_doi column value does not start with "10.5285"
dataCite_df_relationTypes = dataCite_df_relationTypes[dataCite_df_relationTypes['data_doi'].str.startswith('10.5285')]

In [None]:
# get dataset metadata 
info_list = []
headers = {'client-id': 'bl.nerc'}
api_url = 'https://api.datacite.org/dois/' 
# for doi in dataCite_df_relationTypes['data_doi']:
for (source_id, relation_type_id, occurred_at, Page_endpoint, data_doi, pub_doi_url) in zip(dataCite_df_relationTypes['source-id'],dataCite_df_relationTypes['relation-type-id'],dataCite_df_relationTypes['occurred-at'],dataCite_df_relationTypes['Page endpoint'],dataCite_df_relationTypes['data_doi'], dataCite_df_relationTypes['pub_doi_url']):
    r = requests.get((api_url + data_doi), headers)
    print(r.status_code, data_doi)
    
    try:
        # process author info
        author_list = []
        for item in r.json()['data']['attributes']['creators']:
            author_list.append(item['name'])

        info_list.append([
            r.json()['data']['attributes']['publisher'],
            data_doi,
            r.json()['data']['attributes']['titles'][0]['title'],
            author_list,
            r.json()['data']['attributes']['publicationYear'],
            r.json()['data']['attributes']['dates'],
            r.json()['data']['attributes']['registered'],
            source_id, relation_type_id, pub_doi_url, occurred_at, Page_endpoint
        ])
    except Exception as e:
        info_list.append(["error",data_doi,"error","error","error","error","error","error","error",pub_doi_url,"error","error"])
        
columns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'publicationYear', 'dates', 'registered', 
           'source-id', 'relation-type-id', 'pub_doi_url', 'occurred-at', 'Page endpoint']
dataCite_df = pd.DataFrame(info_list, columns = columns)    
print("Done!")
    

In [None]:
# get publication info - takes a long time
dataCite_df_temp = dataCite_df.rename(columns={"pub_doi_url": "subj_id"})
dataCite_df_publication_meta = getPublicationInfo_timeCopy.getPublicationInfo(dataCite_df_temp)

In [None]:
doi_list = []
for url in dataCite_df_publication_meta['subj_id']:
    doi = url.replace('https://doi.org/','')
    doi_list.append(doi)
dataCite_df_publication_meta['pub_doi'] = doi_list # rename to work in getPublicationInfo function 

In [None]:
#process dataset publisher names
newPublisherLst = []
for dataCentreName in dataCite_df_publication_meta['data_publisher']:
    if type(dataCentreName) == float or dataCentreName is None:
        newPublisherLst.append(dataCentreName)
        continue
    else:
        pass

    dataCentreName_lower = dataCentreName.lower() # make it all lowercase as 'in' operator used below is case sensitive
    if 'polar' in dataCentreName_lower:
        newPublisherLst.append('Polar Data Centre (PDC)')
    elif 'atmospheric' in dataCentreName_lower or 'badc' in dataCentreName_lower or 'earth' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    elif 'oceanographic' in dataCentreName_lower:
        newPublisherLst.append('British Oceanographic Data Centre (BODC)')
    elif 'geological' in dataCentreName_lower or 'geoscience' in dataCentreName_lower:
        newPublisherLst.append('National Geoscience Data Centre (NGDC)')
    elif 'environmental information' in dataCentreName_lower:
        newPublisherLst.append('Environmental Information Data Centre (EIDC)')
    elif 'environmental data' in dataCentreName_lower:
        newPublisherLst.append('Centre for Environmental Data Analysis (CEDA)')
    else:
        newPublisherLst.append(dataCentreName)
dataCite_df_publication_meta['publisher_processed'] = newPublisherLst

dataCite_df_publication_meta = dataCite_df_publication_meta.drop(['data_publisher'], axis=1)
dataCite_df_publication_meta = dataCite_df_publication_meta.rename(columns={'publisher_processed':'data_publisher'})

### Output spreadsheet

In [None]:
dataCite_filename = "Results/Intermediate data/" + 'dataCite_df_events_publication_meta_retrieved_' + (today.strftime("%d%m%Y")) + '.csv'
dataCite_df_publication_meta.to_csv(dataCite_filename, index = False)

# Merge results

In [1]:
dataCite_filename = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/Intermediate data/dataCite_df_events_publication_meta_retrieved_19102023.csv"
scholex_filename = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/Intermediate data/scholix_citation_publication_info_19102023.csv"
crossRef_df_processed_filename = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/Intermediate data/dataset_citation_publication_info_1990-01-01_to_2023-10-19_retrieved_19102023.csv"

In [3]:
scholex_df = pd.read_csv(scholex_filename)
crossref_df = pd.read_csv(crossRef_df_processed_filename)
datacite_df = pd.read_csv(dataCite_filename)


In [None]:
datacite_df

In [4]:
## Remove https bits from crossref df DOIs
# remove url bit from subj_id
crossref_doi_list = []
for url in crossref_df['subj_id']:
    doi = url.replace('https://doi.org/','')
    crossref_doi_list.append(doi)
crossref_df['subj_doi'] = crossref_doi_list

# remove url bit from 'obj_id'
crossref_doi_list = []
for url in crossref_df['obj_id']:
    temp = url.split('/')
    crossref_doi_list.append(temp[3] + "/" + temp[4])
crossref_df['obj_doi'] = crossref_doi_list

In [5]:
# process the three dataframes make columns match
# columns should be:
# can add event_source, dates, publication_publisher columns later
newColumns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors']

crossref_column_list = [crossref_df['dataset_publisher_processed'],crossref_df['obj_doi'],crossref_df['dataset_Title'],crossref_df['dataset_authors'],crossref_df['relation_type_id'],crossref_df['subj_doi'],crossref_df['subj_work_type_id'],crossref_df['pub_Title'],crossref_df['pub_authors']]
crossref_df_newColumns = pd.concat(crossref_column_list, axis = 1)
crossref_df_newColumns.columns = newColumns

# scholex_column_list = [scholex_df[['datasetPublisher']],scholex_df[['datasetDOI']],scholex_df[['datasetTitle']],scholex_df[['datasetAuthors_processed']],scholex_df[['relationshipType']],scholex_df[['pubID']],scholex_df[['PubType']],scholex_df[['pubTitle']],scholex_df[['pubAuthors_processed']]]
scholex_column_list = [scholex_df['datasetPublisher'],scholex_df['datasetDOI'],scholex_df['datasetTitle'],scholex_df['datasetAuthors'],scholex_df['relationshipType'],scholex_df['pubID'],scholex_df['PubType'],scholex_df['pubTitle'],scholex_df['pubAuthors_processed']]
scholex_df_newColumns = pd.concat(scholex_column_list, axis = 1)
scholex_df_newColumns.columns = newColumns

datacite_column_list = [datacite_df['data_publisher'], datacite_df['data_doi'], datacite_df['data_title'], datacite_df['data_authors'], datacite_df['relation-type-id'], datacite_df['pub_doi'], datacite_df['publisher'], datacite_df['pub_Title'], datacite_df['pub_authors']] 
datacite_df_newColumns = pd.concat(datacite_column_list, axis = 1)
datacite_df_newColumns.columns = newColumns


In [6]:
# create single list of data dois 
scholix_doi_list = list(scholex_df_newColumns['data_doi'])
crossref_doi_list = list(crossref_df_newColumns['data_doi'])
datacite_doi_list = list(datacite_df_newColumns['data_doi'])

data_doi_list = scholix_doi_list + crossref_doi_list + datacite_doi_list

# remove duplicates = convert to dict and back to list again auto removes dups
data_doi_list_unique = list( dict.fromkeys(data_doi_list))

In [7]:
# loop through list of data dois, check pub DOI in each of crossref scholex and datacite dfs - compare result
comparison_dicts = []
data_doi_df = pd.DataFrame(data_doi_list_unique)
for doi in data_doi_df[0]:
    doi = doi.replace(")","") # remove rogue brackets
    scholex_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(doi)].index
    scholex_matches = scholex_df_newColumns['publication_doi'].iloc[scholex_indices].tolist()
    
    crossref_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(doi)].index
    crossref_matches = crossref_df_newColumns['publication_doi'].iloc[crossref_indices].tolist()
    
    datacite_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(doi)].index
    datacite_matches = datacite_df_newColumns['publication_doi'].iloc[datacite_indices].tolist()
    
    combined = scholex_matches + crossref_matches + datacite_matches
    combined_unique = list(dict.fromkeys(combined))
    
    inScholix_notIn_crossRef = list(set(scholex_matches) - set(crossref_matches))
    inCrossRef_notIn_scholix = list(set(crossref_matches) - set(scholex_matches))
    inDatacite_notIn_scholix_or_crossRef = list(set(datacite_matches) - set(scholex_matches) - set(crossref_matches))
    
    comparison_dicts.append({
        'data_doi': doi,
        'combined_unique_dois': combined_unique,
        'scholex_pub_dois': scholex_matches,
        'crossref_pub_dois': crossref_matches,
        'datacite_pub_dois': datacite_matches,
        'inScholix_notIn_crossRef':inScholix_notIn_crossRef,
        'inCrossRef_notIn_scholix':inCrossRef_notIn_scholix,
        'inDatacite_notIn_scholix_or_crossRef':inDatacite_notIn_scholix_or_crossRef
    })


In [8]:
# for combined_unique_dois create a final dataframe getting metadata  from dfs
results = []
for dataset in comparison_dicts:
    for pubdoi in dataset['scholex_pub_dois']:
        
        # find index of this pubdoi datadoi pair in scholex_df
        pub_indices = scholex_df_newColumns[scholex_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("scholex_pub_dois")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': scholex_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': scholex_df_newColumns.iloc[index]['data_title'],
            'data_Authors': scholex_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': scholex_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': scholex_df_newColumns.iloc[index]['publication_type'],
            'publication_title': scholex_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': scholex_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'Scholix'
            })
  
    
    for pubdoi in dataset['inCrossRef_notIn_scholix']:
        # find index of this pubdoi datadoi pair in crossref_df
        pub_indices = crossref_df_newColumns[crossref_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inCrossRef_notIn_scholix")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': crossref_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': crossref_df_newColumns.iloc[index]['data_title'],
            'data_Authors': crossref_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': crossref_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': crossref_df_newColumns.iloc[index]['publication_type'],
            'publication_title': crossref_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': crossref_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'CrossRef'
            })
        
        
    for pubdoi in dataset['inDatacite_notIn_scholix_or_crossRef']:
        # find index of this pubdoi datadoi pair in datacite_df_newColumns
        pub_indices = datacite_df_newColumns[datacite_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inDatacite_notIn_scholix_or_crossRef")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': datacite_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': datacite_df_newColumns.iloc[index]['data_title'],
            'data_Authors': datacite_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': datacite_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': datacite_df_newColumns.iloc[index]['publication_type'],
            'publication_title': datacite_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': datacite_df_newColumns.iloc[index]['publication_authors'],
            'citation_event_source': 'DataCite'
            })

scholex_pub_dois
pub_indices:  Int64Index([], dtype='int64') data_indices:  Int64Index([108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
            121, 122],
           dtype='int64')
{
  "data_doi": "10.5285/54757cbe-0b13-4385-8b31-4dfaa1dab55e",
  "combined_unique_dois": [
    "10.5285/b809a040-8305-4bc5-baff-76aa2b823734",
    "10.15446/mag.v35n1.96670",
    "10.34248/bsengineering.1077162",
    "10.21511/ppm.20(1).2022.24",
    "10.18069/firatsbed.1129881",
    "10.33724/zm.1149418",
    "10.36456/embrio.v14i2.4836",
    "10.1007/s11696-022-02174-4",
    "10.32362/2500-316x-2022-10-1-7-17",
    "10.1145/3543434.3543485",
    "10.22209/rhs.v10n1a03",
    "10.2478/ceej-2019-0020",
    "10.35609/afr.2016.1.1(7)",
    "10.47847/fagropec.v14n1a5",
    "10.30692/sisad.1020020"
  ],
  "scholex_pub_dois": [
    "10.5285/b809a040-8305-4bc5-baff-76aa2b823734",
    "10.15446/mag.v35n1.96670",
    "10.34248/bsengineering.1077162",
    "10.21511/ppm.20(1).2022.24",
    "10.180

In [9]:
data_citations = pd.DataFrame.from_dict(results)

### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# # TAKES A LONG TIME - hours
# citationStrList = [] # create an empty list in which to put the citation strings

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         r = requests.get(('https://doi.org/' + pubDOI), headers={"Accept": "text/x-bibliography", "style": "apa", "Accept-Charset": "utf-8"})
#         #print(r.status_code)
#         citationStrList.append(r.text) # add the citation strings to the list
#     else:
#         citationStrList.append('not a doi')
    
# data_citations['PubCitationStr'] = citationStrList # add the citation string list to the Scholex df

In [None]:
citationStrList = [] # create an empty list in which to put the citation strings

for pubDOI in data_citations_merged['publication_doi']:
    if pubDOI.startswith('10.'):
        print(pubDOI)
        r = requests.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI), headers={"Accept":"text/x-bibliography", "Accept-Charset": "utf-8"})
        print(r.status_code)
        encoded_citation = r.text
        # add the citation strings to the list and Decode the author names assuming UTF-8 encoding
        citationStrList.append(encoded_citation.encode('latin1').decode('utf-8')) 
    else:
        citationStrList.append('not a doi')
        
data_citations['PubCitationStr'] = citationStrList # add the citation string list to df

In [10]:
# extra requested columns
data_citations['data_doi_url'] = 'doi.org/' + data_citations['data_doi']
data_citations['publication_doi_url'] = 'doi.org/' + data_citations['publication_doi']

In [None]:
datacite_df2 = datacite_df[['data_doi', 'publicationYear']]
data_citations_merged = data_citations.merge(datacite_df2, left_on='data_doi', right_on='data_doi', how='left')
data_citations_merged['publicationYear'] = data_citations_merged['publicationYear'].astype('Int64')
data_citations_merged = data_citations_merged.fillna(np.nan).replace([np.nan], [None])

data_citations_merged = data_citations_merged.drop_duplicates(subset=['data_doi', 'publication_doi'])
data_citations_merged
data_citations = data_citations_merged

In [56]:
data_citations

Unnamed: 0,data_Publisher,data_doi,data_Title,data_Authors,relation_type_id,publication_doi,publication_type,publication_title,publication_authors,citation_event_source,data_doi_url,publication_doi_url,publicationYear,PubCitationStr
0,British Oceanographic Data Centre (BODC),10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2019'],IsRelatedTo,10.3389/fmars.2021.652540,journal-article,Discerning the Management-Relevant Ecology and...,"['Rebecca E. Ross', 'Genoveva Gonzalez-Mirelis...",Scholix,doi.org/10.5285/836f016a-33be-6ddc-e053-6c86ab...,doi.org/10.3389/fmars.2021.652540,,"Ross, R.E., Gonzalez-Mirelis, G., Lozano, P. &..."
1,British Oceanographic Data Centre (BODC),10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2019'],IsRelatedTo,10.11646/zootaxa.4963.3.6,journal-article,Deep-sea Ophiuroidea (Echinodermata) from the ...,"['Sabine Stöhr', ""Timothy D. O'Hara""]",Scholix,doi.org/10.5285/836f016a-33be-6ddc-e053-6c86ab...,doi.org/10.11646/zootaxa.4963.3.6,,"STÖHR, S. & O’HARA, T.D. (2021) Deep-sea Ophiu..."
2,British Oceanographic Data Centre (BODC),10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2019'],IsRelatedTo,2117/369225,not a doi,Discriminating the occurrence of inundation in...,"['Jorge Núñez', 'Patricio A. Catalán', 'Carlos...",Scholix,doi.org/10.5285/836f016a-33be-6ddc-e053-6c86ab...,doi.org/2117/369225,,not a doi
3,British Oceanographic Data Centre (BODC),10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2019'],IsRelatedTo,10.1038/s41598-019-56012-x,journal-article,In search for the sources of plastic marine li...,"['Simon Jan van Gennip', 'Martin Thiel', 'Luis...",Scholix,doi.org/10.5285/836f016a-33be-6ddc-e053-6c86ab...,doi.org/10.1038/s41598-019-56012-x,,"Gennip, S.J. van, Dewitte, B., Garçon, V., et ..."
4,British Oceanographic Data Centre (BODC),10.5285/836f016a-33be-6ddc-e053-6c86abc0788e,The GEBCO_2019 Grid - a continuous terrain mod...,['GEBCO Bathymetric Compilation Group 2019'],IsRelatedTo,10.1038/s41561-022-00961-y,journal-article,Relative sea-level data preclude major late Ho...,"['Scott Braddock', 'Brenda L. Hall', 'Joanne S...",Scholix,doi.org/10.5285/836f016a-33be-6ddc-e053-6c86ab...,doi.org/10.1038/s41561-022-00961-y,,"Braddock, S., Hall, B.L., Johnson, J.S., Balco..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21389,Environmental Information Data Centre (EIDC),10.5285/b06a08bc-39e5-4401-87dd-9568fd5048fd,Radiocarbon dating of charcoal pieces from soi...,"['Feldpausch, T.R.', 'Aragão, L.E.O.C.', 'Phil...",is-referenced-by,https://www.frontiersin.org/articles/10.3389/f...,not a doi,not a doi,not a doi,DataCite,doi.org/10.5285/b06a08bc-39e5-4401-87dd-9568fd...,doi.org/https://www.frontiersin.org/articles/1...,2023,not a doi
21391,Environmental Information Data Centre (EIDC),10.5285/63d4b774-4e03-4db2-95ad-dcca18f0d681,Tree diameter census data in intact forest per...,"['Feldpausch, T.R.', 'Aragão, L.E.O.C.', 'Phil...",is-referenced-by,https://forestplots.net/,not a doi,not a doi,not a doi,DataCite,doi.org/10.5285/63d4b774-4e03-4db2-95ad-dcca18...,doi.org/https://forestplots.net/,2023,not a doi
21392,Environmental Information Data Centre (EIDC),10.5285/6410a578-d21a-4285-8e9c-57efbe2b60d5,Soil pyrogenic and organic carbon data in inta...,"['Feldpausch, T.R.', 'Aragão, L.E.O.C.', 'Phil...",is-referenced-by,https://www.frontiersin.org/articles/10.3389/f...,not a doi,not a doi,not a doi,DataCite,doi.org/10.5285/6410a578-d21a-4285-8e9c-57efbe...,doi.org/https://www.frontiersin.org/articles/1...,2023,not a doi
21394,Environmental Information Data Centre (EIDC),10.5285/6410a578-d21a-4285-8e9c-57efbe2b60d5,Soil pyrogenic and organic carbon data in inta...,"['Feldpausch, T.R.', 'Aragão, L.E.O.C.', 'Phil...",is-referenced-by,10.1016/j.geoderma.2017.07.029,Elsevier BV,Amazon Basin forest pyrogenic carbon stocks: F...,"[['Nina', 'Koele'], ['Michael', 'Bird'], ['Jor...",DataCite,doi.org/10.5285/6410a578-d21a-4285-8e9c-57efbe...,doi.org/10.1016/j.geoderma.2017.07.029,2023,"Koele, N., Bird, M., Haig, J., Marimon-Junior,..."


## Output json and csv file

In [None]:
# Output csv file
today = date.today()


results_folder_path = "Results/v2/"
file_name = 'dataCitations_allSourcesMerged_retrieved_' + (today.strftime("%d%m%Y"))

data_citations_csvfilename = results_folder_path + file_name + '.csv'
data_citations.to_csv(data_citations_csvfilename, index = False)
print(data_citations_csvfilename)

# write data to 'latest_results' csv file
latest_file_name = results_folder_path + 'latest_results' + '.csv'
data_citations.to_csv(latest_file_name, index = False)


# write data to 'latest_results' json file with data publisher as top level key
latest_file_name_json = results_folder_path + 'latest_results' + '.json'

# Group by 'data_Publisher' and convert the DataFrame to a nested dictionary
nested_dict = data_citations.groupby('data_Publisher').apply(
    lambda x: x.drop('data_Publisher', axis=1).to_dict(orient='records')
).to_dict()

# Convert the nested dictionary to a JSON object
import json
json_object = json.dumps(nested_dict)

# Save the JSON object to a file
with open(latest_file_name_json, 'w') as f:
    f.write(json_object)
    
data_citations_jsonfilename = results_folder_path + file_name + '.json'

with open(data_citations_jsonfilename, 'w') as f:
    f.write(json_object)


## Extra cells useful for development, not for use in collecting the citation info:

In [None]:
# Read the JSON file into a pandas DataFrame
results_folder_path = "C:/Users/matnic/OneDrive/OneDrive - UKCEH/Projects/DataCentre Citations/Results/v2/"
latest_file_name_json = results_folder_path + 'latest_results' + '.json'

with open(latest_file_name_json, 'r') as f:
    json_data = json.load(f)

# Convert the nested dictionary back into a DataFrame
records = []
for publisher, data_list in json_data.items():
    for data_dict in data_list:
        data_dict['data_Publisher'] = publisher
        records.append(data_dict)

data_citations = pd.DataFrame.from_records(records)

# Display the DataFrame
data_citations


In [None]:
# check pub title info not given results
result = data_citations[data_citations['publication_title'] == 'Info not given']

result

In [None]:
result['citation_event_source'].value_counts()

In [None]:
# from importlib import reload  # Python 3.4+
# reload(getPublicationInfo_timeCopy)

In [None]:
# import pickle
# # Saving the objects:
# with open('objs.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump([dataCite_df, scholex_df], f)

In [None]:
# import pickle
# # Getting back the objects:
# with open('objs.pkl') as f:  # Python 3: open(..., 'rb')
#     dataCite_df, scholex_df = pickle.load(f)