In [None]:
import json
from datetime import date

import pandas as pd

from citations_fun.mergeCitations import merge_citation_dfs
from citations_fun.getCitationString import get_citation_str
from citations_fun.filterCitations import filterCitations

# Read the latest intermediate results for each source, in the order
# datacite, scholex, overton.
dataCite_df, scholex_df, overton_df = map(
    pd.read_csv,
    [
        "Results/intermediate_data/latest_results_dataCite.csv",
        "Results/intermediate_data/latest_results_scholex.csv",
        "Results/intermediate_data/latest_results_overton.csv",
    ],
)

In [None]:
# merge all results
# The three per-source dataframes are passed as one list to the project helper
# merge_citation_dfs; its result is used as a single dataframe in the cells below.
df_list = [dataCite_df, scholex_df, overton_df]
nerc_citations_df = merge_citation_dfs(df_list)

In [None]:
# remove every occurrence of the literal 'https://doi.org/' from the pub_doi values
# (plain substring replacement, not a regex; values without it are left unchanged)
nerc_citations_df['pub_doi'] = nerc_citations_df['pub_doi'].str.replace(
    'https://doi.org/', '', regex=False
)

In [None]:
# Filtering step. The rows that get filtered out are kept as well so they can
# be checked later.

# filterCitations returns a list of 2 dataframes: element 0 is the main df,
# element 1 is everything that has been filtered out.
results = filterCitations(nerc_citations_df)
nerc_citations_df_filtered, filtered_out_df = results[0], results[1]

# Keep a record of the filtered-out rows on disk.
filtered_out_df.to_csv("Results/v3/filtered_out_df.csv", index=False)

In [None]:
# get citation string - takes about 2 hours
# NOTE: the result is bound back to the name nerc_citations_df, replacing the
# unfiltered merged dataframe that carried this name in the earlier cells.
nerc_citations_df = get_citation_str(nerc_citations_df_filtered) 

In [None]:
# add 'doi.org/' to data and pub dois in new columns
nerc_citations_df['data_doi_url'] = 'doi.org/' + nerc_citations_df['data_doi']

# need extra logic here as overton pub_doi column usually just a normal url:
# only values that start with "10." get the 'doi.org/' prefix, everything else
# is copied across as-is. Non-string values (e.g. NaN for a missing pub_doi)
# are passed through unchanged instead of raising AttributeError on .startswith.
nerc_citations_df['publication_doi_url'] = nerc_citations_df['pub_doi'].apply(
    lambda x: f"doi.org/{x}" if isinstance(x, str) and x.startswith("10.") else x
)


In [None]:
# create publicationYear column from pub_date
def pub_year_splitter(date):
    """Extract the year part from a publication date value.

    Two string layouts are recognised:

    * slash-separated with at least three parts (e.g. ``"12/03/2020"``):
      the third part is returned;
    * anything else is split on ``'-'`` and the first part is returned
      (e.g. ``"2020-03-12"`` -> ``"2020"``, ``"2020"`` -> ``"2020"``).

    Parameters
    ----------
    date : str or any
        The raw pub_date value. Note that inside this function the parameter
        name shadows ``datetime.date`` imported at the top of the notebook.

    Returns
    -------
    str or None
        The extracted year as a string, or ``None`` when the value is the
        placeholder ``"Info not given"`` or is not a string at all
        (e.g. NaN / None). Non-string values are printed so they can be
        inspected.
    """
    if not isinstance(date, str):
        # Missing or non-text values cannot be split; report them and move on.
        print(date)
        return None

    if date == "Info not given":
        return None

    slash_parts = date.split('/')
    if len(slash_parts) >= 3:
        return slash_parts[2]

    # Fewer than three slash-separated parts: treat as hyphen-separated
    # (or a bare year) and take the leading component.
    return date.split('-')[0]

# publicationYear is derived per row by passing each pub_date value through
# pub_year_splitter (None where no year could be extracted)
nerc_citations_df['publicationYear'] = nerc_citations_df['pub_date'].apply(pub_year_splitter)

In [None]:
# Map to the column names expected by the API schema:
#   data_Publisher, data_doi, data_Title, data_Authors, relation_type_id,
#   publication_doi, publication_type, publication_title, publication_authors,
#   publication_date, citation_event_source, PubCitationStr, data_doi_url,
#   publication_doi_url, publicationYear

# Keys are the current column names, values are the new (schema) names.
cols = {
    'relation_type': 'relation_type_id',
    'pub_doi': 'publication_doi',
    'pub_title': 'publication_title',
    'pub_date': 'publication_date',
    'pub_authors': 'publication_authors',
    'source_id': 'citation_event_source',
    'pub_type': 'publication_type',
    'pub_citation_str': 'PubCitationStr',
}

nerc_citations_df_renamed = nerc_citations_df.rename(columns=cols)

In [None]:
# Add the date_added column: pairs already present in the previous results
# keep their old date, everything else gets today's date.

# A citation is identified by its (data_doi, publication_doi) pair.
pair_cols = ['data_doi', 'publication_doi']

# Previous run's results.
old_results = pd.read_csv("Results/v3/latest_results.csv")

# Lookup from (data_doi, publication_doi) to date_added, one entry per pair.
date_map = (
    old_results[pair_cols + ['date_added']]
    .drop_duplicates(subset=pair_cols)
    .set_index(pair_cols)['date_added']
)

# Look up the old date for every row of the new dataframe.
current_pairs = nerc_citations_df_renamed.set_index(pair_cols).index
nerc_citations_df_renamed['date_added'] = current_pairs.map(date_map)

# Pairs with no old date are new: stamp them with today's date.
nerc_citations_df_renamed['date_added'] = nerc_citations_df_renamed['date_added'].fillna(
    str(date.today())
)




In [None]:
# Write the final results to a csv and a json file.
results_folder_path = "Results/v3/"

latest_file_name_csv = results_folder_path + 'latest_results.csv'
nerc_citations_df_renamed.to_csv(latest_file_name_csv, index=False)

# The 'latest_results' json file holds the same rows, with the data publisher
# as the top level key.
latest_file_name_json = results_folder_path + 'latest_results.json'

# Build {data_publisher: [one dict per row, without the data_publisher column]}.
# The groups are iterated directly instead of using groupby(...).apply(...):
# each group frame obtained by iteration still contains the data_publisher
# column, whereas passing the grouping column to an applied function is
# deprecated in pandas 2.2 and dropped in later versions, which would make the
# .drop() below fail.
nested_dict = {
    publisher: group.drop(columns='data_publisher').to_dict(orient='records')
    for publisher, group in nerc_citations_df_renamed.groupby('data_publisher')
}

# Serialise completely before opening the file, so a serialisation error
# cannot leave a truncated json file behind.
# NOTE(review): any NaN values would be written as bare NaN, which strict JSON
# parsers reject - confirm no NaNs reach this point.
json_object = json.dumps(nested_dict)

# Save the JSON text to the file.
with open(latest_file_name_json, 'w') as f:
    f.write(json_object)
    

Development stuff below here:

In [None]:
# # code to filter out things like if pub_doi contains "egusphere" etc? - always a conference abstract
# count = 0
# for pub_doi in nerc_citations_df['pub_doi']:
#     if "egusphere" in pub_doi:
#         print(pub_doi)
#         count =count + 1
# print(count)



In [None]:
# nerc_citations_df['pub_citation_str'].value_counts()

In [None]:
# nerc_citations_df["pub_type"].value_counts()


In [None]:
# # take data_doi and pub_doi columns from each df, concatenate vertically
# doi_df = pd.concat([scholex_df[['data_doi', 'pub_doi']], dataCite_df[['data_doi', 'pub_doi']], overton_df[['data_doi', 'pub_doi']]], ignore_index=True)
# doi_df_unique = doi_df.drop_duplicates(subset=['data_doi', 'pub_doi'])
# print(len(doi_df))
# print(len(doi_df_unique))