### NERC dataset citations - part 2
Takes results from nerc_dataset_citations_part1.ipynb, 
Process and merge the results.
Produces a csv and json with details of the citations for NERC published datasets.

In [1]:
import requests, time, json, re, datetime, os, sys
import numpy as np
import pandas as pd
from math import ceil
from datetime import date

In [2]:
scholex_df = pd.read_csv("Results/Intermediate data/latest_results_scholix.csv")
crossref_df = pd.read_csv("Results/Intermediate data/latest_results_crossRef.csv")
datacite_df = pd.read_csv("Results/Intermediate data/latest_results_dataCite.csv")

In [3]:
# process publication date columns - scholex is ok 

def process_date(date):
    if date.startswith('[['):
        # If it's a date string, extract the year, month, and day
        parts = date.strip('[]').split(', ')
        year = int(parts[0])
        month = int(parts[1])
        # If day is provided
        if len(parts) == 3:
            day = int(parts[2])
        else:
            # If day is not provided, assume it's the first
            day = 1
        # Format the date as yyyy-mm-dd
        return f"{year}-{month:02d}-{day:02d}"
    else:
        # If it's not a date string (e.g., 'Info not given'), return it as is
        return date

# Apply the function to the 'pub_date' column and create a new column 'pub_date_processed'
crossref_df['pub_date_processed'] = crossref_df['pub_date'].apply(process_date)

datacite_df['pub_date_processed'] = datacite_df['pub_date'].apply(process_date)



In [4]:
## Remove https bits from crossref df DOIs
# remove url bit from subj_id
crossref_doi_list = []
for url in crossref_df['subj_id']:
    doi = url.replace('https://doi.org/','')
    crossref_doi_list.append(doi)
crossref_df['subj_doi'] = crossref_doi_list

# remove url bit from 'obj_id'
crossref_doi_list = []
for url in crossref_df['obj_id']:
    temp = url.split('/')
    crossref_doi_list.append(temp[3] + "/" + temp[4])
crossref_df['obj_doi'] = crossref_doi_list

In [5]:
# process the three dataframes make columns match
# columns should be:
# can add event_source, dates, publication_publisher columns later
newColumns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors', 'publication_date']

crossref_column_list = [crossref_df['dataset_publisher_processed'],crossref_df['obj_doi'],crossref_df['dataset_Title'],crossref_df['dataset_authors'],crossref_df['relation_type_id'],crossref_df['subj_doi'],crossref_df['subj_work_type_id'],crossref_df['pub_Title'],crossref_df['pub_authors'],crossref_df['pub_date_processed']]
crossref_df_newColumns = pd.concat(crossref_column_list, axis = 1)
crossref_df_newColumns.columns = newColumns

# scholex_column_list = [scholex_df[['datasetPublisher']],scholex_df[['datasetDOI']],scholex_df[['datasetTitle']],scholex_df[['datasetAuthors_processed']],scholex_df[['relationshipType']],scholex_df[['pubID']],scholex_df[['PubType']],scholex_df[['pubTitle']],scholex_df[['pubAuthors_processed']]]
scholex_column_list = [scholex_df['datasetPublisher'],scholex_df['datasetDOI'],scholex_df['datasetTitle'],scholex_df['datasetAuthors'],scholex_df['relationshipType'],scholex_df['pubID'],scholex_df['PubType'],scholex_df['pubTitle'],scholex_df['pubAuthors_processed'],scholex_df['pubDate']]
scholex_df_newColumns = pd.concat(scholex_column_list, axis = 1)
scholex_df_newColumns.columns = newColumns

datacite_column_list = [datacite_df['data_publisher'], datacite_df['data_doi'], datacite_df['data_title'], datacite_df['data_authors'], datacite_df['relation-type-id'], datacite_df['pub_doi'], datacite_df['publisher'], datacite_df['pub_Title'], datacite_df['pub_authors'],datacite_df['pub_date_processed']] 
datacite_df_newColumns = pd.concat(datacite_column_list, axis = 1)
datacite_df_newColumns.columns = newColumns


In [6]:
# create single list of data dois 
scholix_doi_list = list(scholex_df_newColumns['data_doi'])
crossref_doi_list = list(crossref_df_newColumns['data_doi'])
datacite_doi_list = list(datacite_df_newColumns['data_doi'])

data_doi_list = scholix_doi_list + crossref_doi_list + datacite_doi_list

# remove duplicates = convert to dict and back to list again auto removes dups
data_doi_list_unique = list( dict.fromkeys(data_doi_list))

In [7]:
# loop through list of data dois, check pub DOI in each of crossref scholex and datacite dfs - compare result
comparison_dicts = []
data_doi_df = pd.DataFrame(data_doi_list_unique)
for doi in data_doi_df[0]:
    doi = doi.replace(")","") # remove rogue brackets
    scholex_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(doi)].index
    scholex_matches = scholex_df_newColumns['publication_doi'].iloc[scholex_indices].tolist()
    
    crossref_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(doi)].index
    crossref_matches = crossref_df_newColumns['publication_doi'].iloc[crossref_indices].tolist()
    
    datacite_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(doi)].index
    datacite_matches = datacite_df_newColumns['publication_doi'].iloc[datacite_indices].tolist()
    
    combined = scholex_matches + crossref_matches + datacite_matches
    combined_unique = list(dict.fromkeys(combined))
    
    inScholix_notIn_crossRef = list(set(scholex_matches) - set(crossref_matches))
    inCrossRef_notIn_scholix = list(set(crossref_matches) - set(scholex_matches))
    inDatacite_notIn_scholix_or_crossRef = list(set(datacite_matches) - set(scholex_matches) - set(crossref_matches))
    
    comparison_dicts.append({
        'data_doi': doi,
        'combined_unique_dois': combined_unique,
        'scholex_pub_dois': scholex_matches,
        'crossref_pub_dois': crossref_matches,
        'datacite_pub_dois': datacite_matches,
        'inScholix_notIn_crossRef':inScholix_notIn_crossRef,
        'inCrossRef_notIn_scholix':inCrossRef_notIn_scholix,
        'inDatacite_notIn_scholix_or_crossRef':inDatacite_notIn_scholix_or_crossRef
    })


In [9]:
# for combined_unique_dois create a final dataframe getting metadata  from dfs
results = []
for dataset in comparison_dicts:
    for pubdoi in dataset['scholex_pub_dois']:
        
        # find index of this pubdoi datadoi pair in scholex_df
        pub_indices = scholex_df_newColumns[scholex_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("scholex_pub_dois")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': scholex_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': scholex_df_newColumns.iloc[index]['data_title'],
            'data_Authors': scholex_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': scholex_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': scholex_df_newColumns.iloc[index]['publication_type'],
            'publication_title': scholex_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': scholex_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': scholex_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'Scholix'
            })
  
    
    for pubdoi in dataset['inCrossRef_notIn_scholix']:
        # find index of this pubdoi datadoi pair in crossref_df
        pub_indices = crossref_df_newColumns[crossref_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inCrossRef_notIn_scholix")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': crossref_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': crossref_df_newColumns.iloc[index]['data_title'],
            'data_Authors': crossref_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': crossref_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': crossref_df_newColumns.iloc[index]['publication_type'],
            'publication_title': crossref_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': crossref_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': crossref_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'CrossRef'
            })
        
        
    for pubdoi in dataset['inDatacite_notIn_scholix_or_crossRef']:
        # find index of this pubdoi datadoi pair in datacite_df_newColumns
        pub_indices = datacite_df_newColumns[datacite_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inDatacite_notIn_scholix_or_crossRef")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': datacite_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': datacite_df_newColumns.iloc[index]['data_title'],
            'data_Authors': datacite_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': datacite_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': datacite_df_newColumns.iloc[index]['publication_type'],
            'publication_title': datacite_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': datacite_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': datacite_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'DataCite'
            })

scholex_pub_dois
pub_indices:  Int64Index([], dtype='int64') data_indices:  Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
            85, 86, 87, 88],
           dtype='int64')
{
  "data_doi": "10.5285/a29c5465-b138-234d-e053-6c86abc040b9",
  "combined_unique_dois": [
    "10.2478/prolas-2022-0039",
    "10.5281/zenodo.4783659",
    "10.2478/jengeo-2020-0008",
    "10.1016/j.quascirev.2022.107377",
    "10.1016/j.jasrep.2020.102658",
    "10.2478/ouacsce-2020-0002",
    "10.1186/s40645-022-00473-8",
    "10.3929/ethz-b-000488372",
    "10.1098/rsos.220241",
    "10400.3/6524",
    "10.1002/essoar.10508505.2",
    "10.1016/j.jaesx.2022.100

In [10]:
data_citations = pd.DataFrame.from_dict(results)

### Get the citation string (APA format) of the publication that has cited the dataset

In [None]:
# # TAKES A LONG TIME - hours
# citationStrList = [] # create an empty list in which to put the citation strings

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         r = requests.get(('https://doi.org/' + pubDOI), headers={"Accept": "text/x-bibliography", "style": "apa", "Accept-Charset": "utf-8"})
#         #print(r.status_code)
#         citationStrList.append(r.text) # add the citation strings to the list
#     else:
#         citationStrList.append('not a doi')
    
# data_citations['PubCitationStr'] = citationStrList # add the citation string list to the Scholex df

In [None]:
# citationStrList = [] # create an empty list in which to put the citation strings

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         print(pubDOI)
#         r = requests.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI), headers={"Accept":"text/x-bibliography", "Accept-Charset": "utf-8"})
#         print(r.status_code)
#         encoded_citation = r.text
#         # add the citation strings to the list and Decode the author names assuming UTF-8 encoding
#         citationStrList.append(encoded_citation.encode('latin1').decode('utf-8')) 
#     else:
#         citationStrList.append('not a doi')
        
# data_citations['PubCitationStr'] = citationStrList # add the citation string list to df

In [11]:
from requests.exceptions import Timeout

citationStrList = []  # create an empty list to store citation strings
timeout_seconds = 10  # Define a timeout value for requests in seconds

for pubDOI in data_citations['publication_doi']:
    if pubDOI.startswith('10.'):
        print(pubDOI)
        try:
            # Set a timeout for the request
            r = requests.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI),
                             headers={"Accept": "text/x-bibliography", "Accept-Charset": "utf-8"}, timeout=timeout_seconds)
            print(r.status_code)
            encoded_citation = r.text
            # Decode the author names assuming UTF-8 encoding
            citationStrList.append(encoded_citation.encode('latin1').decode('utf-8'))
        except Timeout:
            print(f"Request for {pubDOI} timed out.")
            citationStrList.append('request timed out')
        except Exception as e:
            print(f"An error occurred for {pubDOI}: {e}")
            citationStrList.append('error occurred')
    else:
        citationStrList.append('not a doi')


10.2478/prolas-2022-0039
200
10.5281/zenodo.4783659
200
10.2478/jengeo-2020-0008
200
10.1016/j.quascirev.2022.107377
200
10.1016/j.jasrep.2020.102658
200
10.2478/ouacsce-2020-0002
200
10.1186/s40645-022-00473-8
200
10.3929/ethz-b-000488372
200
10.1098/rsos.220241
200
10.1002/essoar.10508505.2
200
10.1016/j.jaesx.2022.100103
200
10.1007/s11001-022-09469-x
200
10.1186/s40645-022-00503-5
200
10.3390/jimaging8120317
200
10.1093/gji/ggab493
200
10.1038/s41467-022-35413-z
200
10.1029/2022gl100961
200
10.6084/m9.figshare.13515233
200
10.1111/mms.12978
200
10.1007/s11356-022-23021-9
200
10.3389/fmars.2021.791185
200
10.1186/s40623-022-01607-4
200
10.5281/zenodo.6922181
200
10.1139/cjes-2021-0010
200
10.1080/21664250.2021.1997506
200
10.1139/cjes-2021-0004
200
10.5194/bg-2021-244
200
10.1038/s41598-021-99668-0
200
10.1007/s10816-021-09534-6
200
10.1029/2021gc010283
200
10.1080/1755876x.2021.2000249
200
10.1080/21664250.2022.2145682
200
10.3390/rs13224628
200
10.1007/s12517-020-05972-w
200
10.11

In [12]:
# extra requested columns
data_citations['data_doi_url'] = 'doi.org/' + data_citations['data_doi']
data_citations['publication_doi_url'] = 'doi.org/' + data_citations['publication_doi']

In [13]:
today = date.today()
date_dict = {'year': today.year, 'month': today.month, 'date': today.day}

In [14]:
datacite_df2 = datacite_df[['data_doi', 'publicationYear']]
data_citations_merged = data_citations.merge(datacite_df2, left_on='data_doi', right_on='data_doi', how='left')
data_citations_merged['publicationYear'] = data_citations_merged['publicationYear'].astype('Int64')
data_citations_merged = data_citations_merged.fillna(np.nan).replace([np.nan], [None])

data_citations_merged = data_citations_merged.drop_duplicates(subset=['data_doi', 'publication_doi'])
# data_citations_merged
data_citations = data_citations_merged

## Output json and csv file

In [15]:
# Output csv file
today = date.today()

results_folder_path = "Results/v2/"
file_name = 'dataCitations_allSourcesMerged_retrieved_' + (today.strftime("%d%m%Y"))

data_citations_csvfilename = results_folder_path + file_name + '.csv'
data_citations.to_csv(data_citations_csvfilename, index = False)
print(data_citations_csvfilename)

# write data to 'latest_results' csv file
latest_file_name = results_folder_path + 'latest_results' + '.csv'
data_citations.to_csv(latest_file_name, index = False)


# write data to 'latest_results' json file with data publisher as top level key
latest_file_name_json = results_folder_path + 'latest_results' + '.json'

# Group by 'data_Publisher' and convert the DataFrame to a nested dictionary
nested_dict = data_citations.groupby('data_Publisher').apply(
    lambda x: x.drop('data_Publisher', axis=1).to_dict(orient='records')
).to_dict()

# Convert the nested dictionary to a JSON object
import json
json_object = json.dumps(nested_dict)

# Save the JSON object to a file
with open(latest_file_name_json, 'w') as f:
    f.write(json_object)
    
data_citations_jsonfilename = results_folder_path + file_name + '.json'

with open(data_citations_jsonfilename, 'w') as f:
    f.write(json_object)


Results/v2/dataCitations_allSourcesMerged_retrieved_26042024.csv
