### NERC dataset citations - part 2
Takes results from nerc_dataset_citations_part1.ipynb, 
Process and merge the results.
Produces a csv and json with details of the citations for NERC published datasets.

In [1]:
import requests, time, json, re, datetime, os, sys
import numpy as np
import pandas as pd
from math import ceil
from datetime import date

In [2]:
scholex_df = pd.read_csv("Results/Intermediate data/latest_results_scholix.csv")
crossref_df = pd.read_csv("Results/Intermediate data/latest_results_crossRef.csv")
datacite_df = pd.read_csv("Results/Intermediate data/latest_results_dataCite.csv")

In [3]:
# process publication date columns - scholex is ok 

def process_date(date):
    if date.startswith('[['):
        # If it's a date string, extract the year, month, and day
        parts = date.strip('[]').split(', ')
        year = int(parts[0])
        month = int(parts[1])
        # If day is provided
        if len(parts) == 3:
            day = int(parts[2])
        else:
            # If day is not provided, assume it's the first
            day = 1
        # Format the date as yyyy-mm-dd
        return f"{year}-{month:02d}-{day:02d}"
    else:
        # If it's not a date string (e.g., 'Info not given'), return it as is
        return date

# Apply the function to the 'pub_date' column and create a new column 'pub_date_processed'
crossref_df['pub_date_processed'] = crossref_df['pub_date'].apply(process_date)

datacite_df['pub_date_processed'] = datacite_df['pub_date'].apply(process_date)



In [4]:
## Remove https bits from crossref df DOIs
# remove url bit from subj_id
crossref_doi_list = []
for url in crossref_df['subj_id']:
    doi = url.replace('https://doi.org/','')
    crossref_doi_list.append(doi)
crossref_df['subj_doi'] = crossref_doi_list

# remove url bit from 'obj_id'
crossref_doi_list = []
for url in crossref_df['obj_id']:
    temp = url.split('/')
    crossref_doi_list.append(temp[3] + "/" + temp[4])
crossref_df['obj_doi'] = crossref_doi_list

In [5]:
# process the three dataframes make columns match
# columns should be:
# can add event_source, dates, publication_publisher columns later
newColumns = ['data_publisher', 'data_doi', 'data_title', 'data_authors', 'relation_type_id', 'publication_doi', 'publication_type', 'publication_title', 'publication_authors', 'publication_date']

crossref_column_list = [crossref_df['dataset_publisher_processed'],crossref_df['obj_doi'],crossref_df['dataset_Title'],crossref_df['dataset_authors'],crossref_df['relation_type_id'],crossref_df['subj_doi'],crossref_df['subj_work_type_id'],crossref_df['pub_Title'],crossref_df['pub_authors'],crossref_df['pub_date_processed']]
crossref_df_newColumns = pd.concat(crossref_column_list, axis = 1)
crossref_df_newColumns.columns = newColumns

# scholex_column_list = [scholex_df[['datasetPublisher']],scholex_df[['datasetDOI']],scholex_df[['datasetTitle']],scholex_df[['datasetAuthors_processed']],scholex_df[['relationshipType']],scholex_df[['pubID']],scholex_df[['PubType']],scholex_df[['pubTitle']],scholex_df[['pubAuthors_processed']]]
scholex_column_list = [scholex_df['datasetPublisher'],scholex_df['datasetDOI'],scholex_df['datasetTitle'],scholex_df['datasetAuthors'],scholex_df['relationshipType'],scholex_df['pubID'],scholex_df['PubType'],scholex_df['pubTitle'],scholex_df['pubAuthors_processed'],scholex_df['pubDate']]
scholex_df_newColumns = pd.concat(scholex_column_list, axis = 1)
scholex_df_newColumns.columns = newColumns

datacite_column_list = [datacite_df['data_publisher'], datacite_df['data_doi'], datacite_df['data_title'], datacite_df['data_authors'], datacite_df['relation-type-id'], datacite_df['pub_doi'], datacite_df['publisher'], datacite_df['pub_Title'], datacite_df['pub_authors'],datacite_df['pub_date_processed']] 
datacite_df_newColumns = pd.concat(datacite_column_list, axis = 1)
datacite_df_newColumns.columns = newColumns


In [6]:
# create single list of data dois 
scholix_doi_list = list(scholex_df_newColumns['data_doi'])
crossref_doi_list = list(crossref_df_newColumns['data_doi'])
datacite_doi_list = list(datacite_df_newColumns['data_doi'])

data_doi_list = scholix_doi_list + crossref_doi_list + datacite_doi_list

# remove duplicates = convert to dict and back to list again auto removes dups
data_doi_list_unique = list( dict.fromkeys(data_doi_list))

In [7]:
# loop through list of data dois, check pub DOI in each of crossref scholex and datacite dfs - compare result
comparison_dicts = []
data_doi_df = pd.DataFrame(data_doi_list_unique)
for doi in data_doi_df[0]:
    doi = doi.replace(")","") # remove rogue brackets
    scholex_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(doi)].index
    scholex_matches = scholex_df_newColumns['publication_doi'].iloc[scholex_indices].tolist()
    
    crossref_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(doi)].index
    crossref_matches = crossref_df_newColumns['publication_doi'].iloc[crossref_indices].tolist()
    
    datacite_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(doi)].index
    datacite_matches = datacite_df_newColumns['publication_doi'].iloc[datacite_indices].tolist()
    
    combined = scholex_matches + crossref_matches + datacite_matches
    combined_unique = list(dict.fromkeys(combined))
    
    inScholix_notIn_crossRef = list(set(scholex_matches) - set(crossref_matches))
    inCrossRef_notIn_scholix = list(set(crossref_matches) - set(scholex_matches))
    inDatacite_notIn_scholix_or_crossRef = list(set(datacite_matches) - set(scholex_matches) - set(crossref_matches))
    
    comparison_dicts.append({
        'data_doi': doi,
        'combined_unique_dois': combined_unique,
        'scholex_pub_dois': scholex_matches,
        'crossref_pub_dois': crossref_matches,
        'datacite_pub_dois': datacite_matches,
        'inScholix_notIn_crossRef':inScholix_notIn_crossRef,
        'inCrossRef_notIn_scholix':inCrossRef_notIn_scholix,
        'inDatacite_notIn_scholix_or_crossRef':inDatacite_notIn_scholix_or_crossRef
    })


In [8]:
# for combined_unique_dois create a final dataframe getting metadata  from dfs
results = []
for dataset in comparison_dicts:
    for pubdoi in dataset['scholex_pub_dois']:
        
        # find index of this pubdoi datadoi pair in scholex_df
        pub_indices = scholex_df_newColumns[scholex_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = scholex_df_newColumns[scholex_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("scholex_pub_dois")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': scholex_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': scholex_df_newColumns.iloc[index]['data_title'],
            'data_Authors': scholex_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': scholex_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': scholex_df_newColumns.iloc[index]['publication_type'],
            'publication_title': scholex_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': scholex_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': scholex_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'Scholix'
            })
  
    
    for pubdoi in dataset['inCrossRef_notIn_scholix']:
        # find index of this pubdoi datadoi pair in crossref_df
        pub_indices = crossref_df_newColumns[crossref_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = crossref_df_newColumns[crossref_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inCrossRef_notIn_scholix")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': crossref_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': crossref_df_newColumns.iloc[index]['data_title'],
            'data_Authors': crossref_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': crossref_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': crossref_df_newColumns.iloc[index]['publication_type'],
            'publication_title': crossref_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': crossref_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': crossref_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'CrossRef'
            })
        
        
    for pubdoi in dataset['inDatacite_notIn_scholix_or_crossRef']:
        # find index of this pubdoi datadoi pair in datacite_df_newColumns
        pub_indices = datacite_df_newColumns[datacite_df_newColumns.publication_doi.str.match(pubdoi)].index
        data_indices = datacite_df_newColumns[datacite_df_newColumns.data_doi.str.match(dataset['data_doi'])].index
        try:
            index = list((set(pub_indices) & set(data_indices)))[0]
        except Exception as e:
            print("inDatacite_notIn_scholix_or_crossRef")
            print("pub_indices: ", pub_indices, "data_indices: ", data_indices)
            print(json.dumps(dataset, indent=2))
            continue
        
        results.append({
            'data_Publisher': datacite_df_newColumns.iloc[index]['data_publisher'],
            'data_doi':dataset['data_doi'],
            'data_Title': datacite_df_newColumns.iloc[index]['data_title'],
            'data_Authors': datacite_df_newColumns.iloc[index]['data_authors'],
            'relation_type_id': datacite_df_newColumns.iloc[index]['relation_type_id'],
            'publication_doi':pubdoi,
            'publication_type': datacite_df_newColumns.iloc[index]['publication_type'],
            'publication_title': datacite_df_newColumns.iloc[index]['publication_title'],
            'publication_authors': datacite_df_newColumns.iloc[index]['publication_authors'],
            'publication_date': datacite_df_newColumns.iloc[index]['publication_date'],
            'citation_event_source': 'DataCite'
            })

scholex_pub_dois
pub_indices:  Int64Index([], dtype='int64') data_indices:  Int64Index([31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57],
           dtype='int64')
{
  "data_doi": "10.5285/5dc179dc-f692-49ba-9326-a6893a503f6e",
  "combined_unique_dois": [
    "10.1016/j.scitotenv.2018.12.108",
    "10.1016/j.scitotenv.2016.03.066",
    "10.5194/hess-25-5517-2021",
    "10.1016/j.envsoft.2018.07.006",
    "10.1088/1748-9326/aac78c",
    "10.1080/1747423x.2018.1537312",
    "10.1144/qjegh2017-051",
    "10.1016/j.scitotenv.2017.08.092",
    "1983/01cf1647-8606-496a-a257-aa1f6cce0bdd",
    "10.5194/hessd-11-11763-2014",
    "10.1002/joc.5336",
    "10.3389/frwa.2021.684982",
    "10.5194/gmd-12-765-2019",
    "10.5285/f2856ee8-da6e-4b67-bedb-590520c77b3c",
    "10.1029/2020wr028393",
    "10.1016/j.ejrh.2015.05.014",
    "10.1142/s2345737615500050",
    "10.1061/(asce)he.1943-5584.0001892",
    "10.5194/hess-21-1189-20

In [9]:
data_citations = pd.DataFrame.from_dict(results)

### Get the citation string (APA format) of the publication that has cited the dataset

In [10]:
# # TAKES A LONG TIME - hours
# citationStrList = [] # create an empty list in which to put the citation strings

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         r = requests.get(('https://doi.org/' + pubDOI), headers={"Accept": "text/x-bibliography", "style": "apa", "Accept-Charset": "utf-8"})
#         #print(r.status_code)
#         citationStrList.append(r.text) # add the citation strings to the list
#     else:
#         citationStrList.append('not a doi')
    
# data_citations['PubCitationStr'] = citationStrList # add the citation string list to the Scholex df

In [11]:
# citationStrList = [] # create an empty list in which to put the citation strings

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         print(pubDOI)
#         r = requests.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI), headers={"Accept":"text/x-bibliography", "Accept-Charset": "utf-8"})
#         print(r.status_code)
#         encoded_citation = r.text
#         # add the citation strings to the list and Decode the author names assuming UTF-8 encoding
#         citationStrList.append(encoded_citation.encode('latin1').decode('utf-8')) 
#     else:
#         citationStrList.append('not a doi')
        
# data_citations['PubCitationStr'] = citationStrList # add the citation string list to df

In [12]:
# from requests.exceptions import Timeout

# citationStrList = []  # create an empty list to store citation strings
# timeout_seconds = 10  # Define a timeout value for requests in seconds

# for pubDOI in data_citations['publication_doi']:
#     if pubDOI.startswith('10.'):
#         print(pubDOI)
#         try:
#             # Set a timeout for the request
#             r = requests.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI),
#                              headers={"Accept": "text/x-bibliography", "Accept-Charset": "utf-8"}, timeout=timeout_seconds)
#             print(r.status_code)
#             encoded_citation = r.text
#             # Decode the author names assuming UTF-8 encoding
#             citationStrList.append(encoded_citation.encode('latin1').decode('utf-8'))
#         except Timeout:
#             print(f"Request for {pubDOI} timed out.")
#             citationStrList.append('request timed out')
#         except Exception as e:
#             print(f"An error occurred for {pubDOI}: {e}")
#             citationStrList.append('error occurred')
#     else:
#         citationStrList.append('not a doi')


In [13]:
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError, Timeout
from urllib3.util.retry import Retry

citationStrList = []  # create an empty list to store citation strings
timeout_seconds = 10  # Define a timeout value for requests in seconds

# Create a session with a custom adapter and retry configuration
session = requests.Session()

# Define a retry strategy with a backoff factor
retry_strategy = Retry(
    total=5,  # Maximum number of retry attempts
    backoff_factor=2,  # Factor by which to multiply the sleep time between retries
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes that trigger a retry
)
adapter = HTTPAdapter(max_retries=retry_strategy)  # Apply the retry strategy
session.mount('https://', adapter)

for pubDOI in data_citations['publication_doi']:
    if pubDOI.startswith('10.'):
        print(pubDOI)
        result = None
        try:
            # Set a timeout for the request
            r = session.get(("https://citation.crosscite.org/format?style=frontiers-of-biogeography&lang=en-GB&doi=" + pubDOI),
                            headers={"Accept": "text/x-bibliography", "Accept-Charset": "utf-8"}, timeout=timeout_seconds)
            print(r.status_code)
            encoded_citation = r.text
            # Decode the author names assuming UTF-8 encoding
            result = encoded_citation.encode('latin1').decode('utf-8')
        except Timeout:
            print(f"Request for {pubDOI} timed out.")
        except ConnectionError as ce:
            print(f"A connection error occurred for {pubDOI}: {ce}")
        except Exception as e:
            print(f"An error occurred for {pubDOI}: {e}")
        finally:
            citationStrList.append(result if result is not None else 'error occurred')
    else:
        citationStrList.append('not a doi')

data_citations['PubCitationStr'] = citationStrList # add the citation string list to df
print('Done!')

10.1080/01431161.2021.1988185
200
10.5194/gmd-14-7287-2021
200
10.1007/s12665-022-10721-1
200
10.5285/37702a54-b7a4-40ff-b62e-d14b161b69ca
200
10.1109/tgrs.2022.3175256
200
10.5194/gmd-14-7287-2021
200
10.5194/essd-13-1737-2021
200
10.5194/gmd-14-7287-2021
200
10.5285/5060cc27-0b5b-471b-86eb-71f96da0c80f
200
10.5194/hess-25-2445-2021
200
10.1016/j.catena.2023.107058
200
10.1016/j.jhydrol.2023.129118
200
10.1007/s12061-021-09397-0
200
10.5194/bg-16-1641-2019
200
10.1007/s10980-020-01059-9
200
10.5194/gmd-12-765-2019
200
10.2166/bgs.2022.021
200
10.3389/fsufs.2019.00042
200
10.5194/essd-12-2459-2020
200
10.1098/rsta.2021.0292
200
10.1016/j.scitotenv.2018.08.190
200
10.5194/gmd-15-1913-2022
200
10.1016/j.ecolind.2022.108719
200
10.1038/s43247-021-00334-0
200
10.5285/10874370-bc58-4d23-a118-ea07df8a07f2
200
10.3390/agronomy11020314
200
10.5285/2ab15bf0-ad08-415c-ba64-831168be7293
200
10.5194/essd-2017-137
200
10.5194/bg-16-1641-2019
200
10.1016/j.jenvman.2020.110550
200
10.1088/1748-9326/a

In [14]:
# extra requested columns
data_citations['data_doi_url'] = 'doi.org/' + data_citations['data_doi']
data_citations['publication_doi_url'] = 'doi.org/' + data_citations['publication_doi']

In [15]:
today = date.today()
date_dict = {'year': today.year, 'month': today.month, 'date': today.day}

In [16]:
datacite_df2 = datacite_df[['data_doi', 'publicationYear']]
data_citations_merged = data_citations.merge(datacite_df2, left_on='data_doi', right_on='data_doi', how='left')
data_citations_merged['publicationYear'] = data_citations_merged['publicationYear'].astype('Int64')
data_citations_merged = data_citations_merged.fillna(np.nan).replace([np.nan], [None])

data_citations_merged = data_citations_merged.drop_duplicates(subset=['data_doi', 'publication_doi'])
# data_citations_merged
data_citations = data_citations_merged

## Output json and csv file

In [17]:
# Output csv file
today = date.today()

results_folder_path = "Results/v2/"
file_name = 'dataCitations_allSourcesMerged_retrieved_' + (today.strftime("%d%m%Y"))

data_citations_csvfilename = results_folder_path + file_name + '.csv'
data_citations.to_csv(data_citations_csvfilename, index = False)
print(data_citations_csvfilename)

# write data to 'latest_results' csv file
latest_file_name = results_folder_path + 'latest_results' + '.csv'
data_citations.to_csv(latest_file_name, index = False)


# write data to 'latest_results' json file with data publisher as top level key
latest_file_name_json = results_folder_path + 'latest_results' + '.json'

# Group by 'data_Publisher' and convert the DataFrame to a nested dictionary
nested_dict = data_citations.groupby('data_Publisher').apply(
    lambda x: x.drop('data_Publisher', axis=1).to_dict(orient='records')
).to_dict()

# Convert the nested dictionary to a JSON object
import json
json_object = json.dumps(nested_dict)

# Save the JSON object to a file
with open(latest_file_name_json, 'w') as f:
    f.write(json_object)
    
data_citations_jsonfilename = results_folder_path + file_name + '.json'

with open(data_citations_jsonfilename, 'w') as f:
    f.write(json_object)


Results/v2/dataCitations_allSourcesMerged_retrieved_30042024.csv
