In [2]:
import os
import string
from ast import literal_eval
from collections import defaultdict

import numpy as np
import pandas as pd
from semanticscholar import SemanticScholar, SemanticScholarException

from automatic_update.get_biblatex import GetBiblatex
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file

In [3]:
def get_item_to_blacklist(item): # item here is a row from the manually checked csv file
    """Build the blacklist.csv record for one manually checked row.

    Parameters
    ----------
    item : mapping-like (e.g. dict or pandas Series)
        Row of the manually checked sheet. Only ``item.get`` is used, so a
        missing column simply yields ``None`` for that field.

    Returns
    -------
    dict
        Record with the keys ``staff_id``, ``staff_name``, ``ss_year``,
        ``ss_id``, ``title``, ``doi``, ``Should be in diag.bib`` (always
        ``'no'``) and ``Reason``.
    """
    move_to_blacklist = {
        'staff_id': item.get('staff_id', None),
        # Read the name from its own column; it used to be filled with the staff id.
        'staff_name': item.get('staff_name', None),
        'ss_year': item.get('ss_year', None),
        'ss_id': item.get('ss_id', None),
        'title': item.get('ss_title', None),
        'doi': item.get('ss_doi', None),
        'Should be in diag.bib': 'no',
        'Reason': item.get('Blacklist reason', None)
    }

    return move_to_blacklist

In [4]:
def update_blacklist_csv(blacklist_df, blacklist_entries, blacklist_out_file): #blacklist_csv is a df
    """Append new blacklist records to the blacklist frame and write it as csv.

    Parameters
    ----------
    blacklist_df : pandas.DataFrame
        Current content of the blacklist. It is not modified.
    blacklist_entries : list of dict
        Records to append (one dict per row).
    blacklist_out_file : path or file-like
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    str
        Message stating how many entries were added.
    """
    new_rows = pd.DataFrame(blacklist_entries)
    merged = pd.concat([blacklist_df, new_rows], ignore_index=True)

    # Persist the merged table without the index column
    merged.to_csv(blacklist_out_file, index=False)

    n_added = len(blacklist_entries)
    return f"{n_added} items added to blacklist"

In [5]:
# Code to get citations from semantic scholar. If there are multiple ss_ids, we should get the number of citations for each of them and sum the two (or more?) values.
def get_citations(semantic_scholar_ids, sch, max_tries=8):
    """Look up the number of citations for each Semantic Scholar id.

    Parameters
    ----------
    semantic_scholar_ids : iterable of str
        Ids to query.
    sch : object with a ``get_paper(ss_id)`` method
        Client used for the lookups (a ``SemanticScholar`` instance in this notebook).
    max_tries : int, optional
        Maximum number of attempts per id when a lookup fails with an
        unexpected error (default 8, the value that used to be hard-coded).

    Returns
    -------
    (dict, list)
        ``dict_cits`` maps the returned ``paperId`` to ``len(paper['citations'])``.
        ``ss_ids_not_found`` lists every id for which no count was obtained:
        ids reported as not found by the API *and* ids for which all
        ``max_tries`` attempts failed. The latter used to be dropped silently,
        so they never showed up in the failure report.
    """
    # NOTE(review): the count is the length of the returned 'citations' list;
    # whether the API truncates that list for highly cited papers is not
    # visible from here - confirm against the semanticscholar documentation.
    dict_cits = {}
    ss_ids_not_found = []
    for ss_id in semantic_scholar_ids:
        for attempt in range(max_tries):
            print('trying time', attempt, ss_id)
            try:
                paper = sch.get_paper(ss_id)
                paper_id = paper['paperId']
                dict_cits[paper_id] = len(paper['citations'])
            except SemanticScholarException.ObjectNotFoundException:
                # failed cleanly: the id does not exist, retrying is pointless
                ss_ids_not_found.append(ss_id)
                print('failed cleanly to get citations')
                break
            except Exception as e:  # some kind of time out error
                print('failed to get citations, trying again', repr(e))
                continue
            print('success getting citations')
            break
        else:
            # every attempt raised: report the id instead of losing it
            print('giving up on', ss_id, 'after', max_tries, 'tries')
            ss_ids_not_found.append(ss_id)

    return dict_cits, ss_ids_not_found

In [10]:
def get_bib_info(diag_bib_file, item): #diag_bib_file is the file read in as a string, item is row from csv
    """Fetch the BibLaTeX text for a row's DOI unless that DOI is already in the bib file.

    Parameters
    ----------
    diag_bib_file : str
        Complete content of the bib file.
    item : mapping-like (dict or pandas Series)
        Row with at least ``'ss_doi'`` and ``'ss_id'``.

    Returns
    -------
    str or None
        The text returned by ``GetBiblatex(...).get_bib_text()``, or ``None`` when
        the row has no DOI, the DOI is already present in ``diag_bib_file``, or
        the reader returned the sentinel ``'empty'``.
    """
    doi = str(item['ss_doi']).strip()

    # if no ss_doi exists (empty / whitespace-only string or NaN)
    if len(doi) == 0 or doi == 'nan':
        print('no ss_doi available, I cannot add new bib entry', item['ss_id'])
        return None

    # Make sure the doi is not already in diag.bib. Every occurrence is inspected:
    # checking only the first one misses an exact match that appears after a
    # longer doi sharing the same prefix (e.g. '10.1/abcd' before '10.1/abc').
    similar_found = False
    start_index = diag_bib_file.find(doi)
    while start_index != -1:
        similar_found = True
        end_index = start_index + len(doi)
        # An exact match is a doi that is directly followed by the closing brace.
        if diag_bib_file[end_index:end_index + 1] == '}':
            print('doi already exists in bib file, I will not add new bib entry', doi, item['ss_id'])
            return None
        start_index = diag_bib_file.find(doi, start_index + 1)

    if similar_found:
        print('similar doi already exists in bib file, but new item will be added for ', doi, item['ss_id'])

    # Get BibLatex information based on DOI if not in the file
    reader = GetBiblatex(doi=doi, diag_bib=diag_bib_file)
    bibtext = reader.get_bib_text()

    # Return the bibtext if it is not 'empty', otherwise return None
    return bibtext if bibtext != 'empty' else None

In [7]:
def _optional_text(value):
    """Return ``value`` as stripped text, mapping None / NaN / 'nan' to ''."""
    if value is None:
        return ''
    text = str(value).strip()
    return '' if text.lower() == 'nan' else text


def add_ss_id_doi_pmid_to_existing_bibkey(diag_bib_raw, item_row):
    """Attach a row's ss_id (and doi / pmid when missing) to an existing bib entry.

    Parameters
    ----------
    diag_bib_raw : list of entry objects
        Parsed bib entries; each exposes ``type``, ``key`` and a ``fields`` dict.
        The matching entry is modified in place.
    item_row : mapping-like (dict or pandas Series)
        Row with ``'ss_id'`` and ``'bibkey'``, and optionally ``'ss_doi'`` /
        ``'ss_pmid'``.

    Returns
    -------
    list
        ``[diag_bib_raw, 'Success']`` when the bibkey was found, otherwise
        ``[diag_bib_raw, 'Fail']``.
    """
    ss_id = str(item_row['ss_id']).strip()
    bibkey = item_row['bibkey']
    strip_punctuation = str.maketrans('', '', string.punctuation)

    #Update bibkey with ss_id
    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue

        # if we found the relevant key
        if bibkey == entry.key:
            if 'all_ss_ids' in entry.fields:
                # The field occurs as '{[id]}', "{['id1', 'id2']}" or '{id}'. Only
                # the second form is a valid Python literal, so literal_eval
                # crashed on the single-id form written below. Tokenise instead:
                # drop punctuation, split on whitespace.
                known_ids = entry.fields['all_ss_ids'].translate(strip_punctuation).split()
                if ss_id not in known_ids:
                    known_ids.append(ss_id)
                    entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'
            # if there is no ss_id here yet just add this single one
            else:
                entry.fields['all_ss_ids'] = '{[' + ss_id + ']}'
            print(ss_id, 'added to diag_bib_raw')

            # doi / pmid are only filled in when the entry has none and the row
            # carries a real value (NaN / empty cells are ignored)
            ss_doi = _optional_text(item_row.get('ss_doi'))
            if 'doi' not in entry.fields and len(ss_doi) > 0:
                print('will add doi to bibkey', bibkey, ss_doi)
                entry.fields['doi'] = '{' + ss_doi + '}'
            ss_pmid = _optional_text(item_row.get('ss_pmid'))
            if 'pmid' not in entry.fields and len(ss_pmid) > 0:
                print('will add pmid to bibkey', bibkey, ss_pmid)
                entry.fields['pmid'] = '{' + ss_pmid + '}'

            return [diag_bib_raw, 'Success']

    # if we haven't returned by now then we failed to update
    print('failed to add ss_id to diag.bib', str(ss_id), str(bibkey))
    return [diag_bib_raw, 'Fail']

In [11]:
def add_ss_id_and_pmid_where_possible(diag_bib_raw, dict_new_items_bibkey_ss_id_and_pmid):
    """Add ss_ids (and pmids where missing) to the entries listed in the lookup dict.

    Parameters
    ----------
    diag_bib_raw : list of entry objects
        Parsed bib entries; each exposes ``type``, ``key`` and a ``fields`` dict.
        Matching entries are modified in place.
    dict_new_items_bibkey_ss_id_and_pmid : dict
        Maps a bibkey to ``{'ss_id': ...}`` with an optional ``'ss_pmid'``.

    Returns
    -------
    list
        The same ``diag_bib_raw`` object that was passed in.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # iterate through all items in the diag bib and update them if we have missing information on them
    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue

        new_info = dict_new_items_bibkey_ss_id_and_pmid.get(entry.key)
        if new_info is None:
            continue

        ss_id = str(new_info['ss_id']).strip()
        print('will add ss_id to bibkey', entry.key, ss_id)
        if 'all_ss_ids' in entry.fields:
            # The field occurs as '{[id]}', "{['id1', 'id2']}" or '{id}'. Only the
            # second form is a valid Python literal, so literal_eval crashed on
            # the single-id form written below. Tokenise instead: drop
            # punctuation, split on whitespace.
            known_ids = entry.fields['all_ss_ids'].translate(strip_punctuation).split()
            if ss_id not in known_ids:
                known_ids.append(ss_id)
                entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'
        # if there is no ss_id here yet just add this single one
        else:
            entry.fields['all_ss_ids'] = '{[' + ss_id + ']}'

        # only write a pmid when the entry has none and a non-empty value is known
        ss_pmid = str(new_info.get('ss_pmid', '')).strip()
        if 'pmid' not in entry.fields and len(ss_pmid) > 0:
            print('will add pmid to bibkey', ss_pmid)
            entry.fields['pmid'] = '{' + ss_pmid + '}'

    return diag_bib_raw

In [12]:
def update_citation_count(diag_bib_raw):
    """Refresh the ``gscites`` field of every entry that carries ``all_ss_ids``.

    For each such entry the citation counts of all its ids are fetched through
    ``get_citations`` and summed. ``gscites`` is only ever increased; a much
    lower new count just triggers a warning.

    Parameters
    ----------
    diag_bib_raw : list of entry objects
        Parsed bib entries; each exposes ``type``, ``key`` and a ``fields`` dict.
        Entries are modified in place.

    Returns
    -------
    (list, list)
        The same ``diag_bib_raw`` object and the list of ss_ids for which
        ``get_citations`` reported no result.
    """
    all_ss_ids_not_found = []

    sch = SemanticScholar(timeout=40)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' not in entry.fields:
            continue

        # Strip braces/brackets/quotes/commas, then split on any whitespace.
        # split() (instead of split(' ')) never yields empty-string "ids".
        all_ss_ids = entry.fields['all_ss_ids'].translate(strip_punctuation).split()
        if not all_ss_ids:
            print('no usable ss ids for key', entry.key, '- skipping')
            continue

        print('trying with key', entry.key, 'and ss ids', all_ss_ids)
        dict_cits, ss_ids_not_found_this_item = get_citations(all_ss_ids, sch)
        if len(ss_ids_not_found_this_item) > 0:
            print('adding items to ss_ids_not_found', ss_ids_not_found_this_item)
            all_ss_ids_not_found.extend(ss_ids_not_found_this_item)

        if not dict_cits:
            # Nothing was retrieved at all: a count of 0 would be a guess, so do
            # not write it into an entry that has no gscites yet.
            print('no citation data retrieved for', entry.key, '- leaving gscites untouched')
            continue

        n_cits = sum(dict_cits.values())
        print('n_cits this item is ', n_cits)
        # TODO: is it correct logic to use this field name or should we make a new one?
        if 'gscites' in entry.fields:
            try:
                previous_cits = int(entry.fields['gscites'].strip('{}'))
            except ValueError:
                # keep the run alive on a hand-edited / malformed value
                print('warning: could not parse existing gscites for', entry.key,
                      entry.fields['gscites'], '- leaving it untouched')
                continue
            # only update if we are increasing the number of citations!!!
            if n_cits > previous_cits:
                print('updating', entry.key, 'from', previous_cits, 'to', n_cits)
                entry.fields['gscites'] = '{' + str(n_cits) + '}'
            elif (previous_cits > (1.5 * n_cits)) and (previous_cits - n_cits > 10):
                print('warning: num citations calculated for this bibkey is much lower than previously suggested....', entry.key, previous_cits, n_cits)
            else:
                print('will not update', entry.key, 'as there is no increase', n_cits, previous_cits)
        else:
            print('adding gscites', entry.key, n_cits)
            entry.fields['gscites'] = '{' + str(n_cits) + '}'
    print('done updating citations')
    return diag_bib_raw, all_ss_ids_not_found

# Load manually checked csv file and bib file

In [13]:
# Read the manually checked sheet. ss_pmid is normalised to text: missing values
# go through a '-1' placeholder so the column can be cast to int (dropping any
# trailing '.0'), and the placeholder is then replaced by an empty string.
manually_checked = pd.read_excel("./script_data/manual_check_20231018.xlsx")
manually_checked['ss_pmid'] = (
    manually_checked['ss_pmid']
    .fillna('-1')
    .astype(int)
    .astype(str)
    .replace('-1', '')
)
manually_checked['ss_doi'] = manually_checked['ss_doi'].fillna('')

# Read the bib file as one plain string (no parsing yet)
#TODO: in the end when this script is routine this should just read the live diag.bib
cwd = os.getcwd()
parent_directory = os.path.dirname(cwd)
diag_bib_path = os.path.join(parent_directory, 'scripts/script_data/diag_ss.bib')
with open(diag_bib_path, 'r', encoding="utf8") as orig_bib_file:
    diag_bib_str = orig_bib_file.read()


In [None]:
# Iterate through all items in the manually checked csv and dispatch on the
# 'action' column. New bib entries are appended to diag_bib_str right away;
# ss_id updates and blacklist rows are only collected here and applied later.
blacklist_items = []
num_items_added = 0
items_to_update = []

failed_new_items = []
failed_updated_items = []
failed_to_find_actions = []

dict_new_items_bibkey_ss_id_and_pmid = {}


for index, bib_item in manually_checked.iterrows():
    print(f"Working on {index}/{len(manually_checked)}: {bib_item['ss_doi']} (action is {bib_item['action']})")

    # A missing action is NaN (a float) rather than a string. Going through str()
    # lets such a row end up in failed_to_find_actions instead of raising a
    # TypeError on the "in" test and aborting the whole loop.
    action = str(bib_item['action']).strip()

    # Make sure item is manually checked
    if "," in action:
        print(f"{bib_item['ss_id']} has not been checked yet, make sure only 1 action is mentioned")
        failed_to_find_actions.append(bib_item)
        continue

    # Add new item to diag.bib
    elif action == "[add new item]":
        bib_item_text = get_bib_info(diag_bib_str, bib_item)

        # get_bib_info returns None when no entry text could be obtained
        if bib_item_text is not None:
            num_items_added += 1
            # update the diag_bib_str immediately so that the bibkey cannot be reused for future additions in this loop
            diag_bib_str = diag_bib_str + bib_item_text
            # if there is a pmid note it to be added afterwards
            ss_pmid = str(bib_item['ss_pmid']).strip()
            ss_id = str(bib_item['ss_id']).strip() # there must always be an ss_id
            # bit of a hacky way to get the bibkey of the added item:
            # the text between the first '{' and the first ','
            bibkey_added = bib_item_text[bib_item_text.index('{')+1:bib_item_text.index(',')]
            dict_new_items_bibkey_ss_id_and_pmid[bibkey_added] = {'ss_id': ss_id}
            if len(ss_pmid) > 0:
                dict_new_items_bibkey_ss_id_and_pmid[bibkey_added]['ss_pmid'] = ss_pmid
            print('storing bibkey, ss_id and pmid', bibkey_added, ss_id, ss_pmid)
        else:
            print('failed to find details for doi, ss_id', bib_item['ss_doi'], bib_item['ss_id'])
            failed_new_items.append(bib_item)

    # Add ss_id to already existing doi in diag.bib
    elif "[add ss_id]" in action:
        # just store a list of these items for now and we will update the file at the end
        items_to_update += [bib_item]

    # Get items to blacklist
    elif "blacklist" in action:
        blacklist_item = get_item_to_blacklist(bib_item)
        blacklist_items.append(blacklist_item)

    # Get None items
    elif '[None]' in action:
        continue

    else:
        print('failed to find action', bib_item['action'])
        failed_to_find_actions.append(bib_item)

# Save the file with new bib items in it

In [None]:
# Write the bib text (diag_bib_str) to a temporary .bib file on disk.
# The file is overwritten if it already exists ('w' mode).
# TODO : write to correct location
diag_bib_path_tmp_new = os.path.join(parent_directory, 'scripts/script_data/diag_ss_tmp_new.bib')
with open(diag_bib_path_tmp_new, 'w', encoding="utf8") as bibtex_file:
    bibtex_file.write(diag_bib_str)

# Update newly added items with ss_id and pmids where possible

In [None]:
# Next we re-open the bib file using the read_bibfile method and update the newly added items
# with their ss_id (and pmid where one was recorded in dict_new_items_bibkey_ss_id_and_pmid).
# TODO read from correct location here
diag_bib_raw = read_bibfile(None, diag_bib_path_tmp_new)
diag_bib_raw = add_ss_id_and_pmid_where_possible(diag_bib_raw, dict_new_items_bibkey_ss_id_and_pmid)

# Update existing bib entries with new ss_ids (and dois, pmids where possible)

In [None]:
# Apply the collected "[add ss_id]" rows to the parsed bib entries
# (doi and pmid are filled in at the same time when possible);
# rows whose bibkey could not be updated are remembered for the final report.
for item_to_update in items_to_update:
    diag_bib_raw, result = add_ss_id_doi_pmid_to_existing_bibkey(diag_bib_raw, item_to_update)
    if result != 'Fail':
        continue
    failed_updated_items.append(item_to_update)


# Intermediate write, kept for safety / debugging only - it can be removed
save_to_file(diag_bib_raw, None, diag_bib_path_tmp_new)

# Update citation counts

In [None]:
# Start with an empty list so the failure report further down still works
# when the citation update cell below has not been run.
ss_ids_not_found_for_citations =[]

In [None]:

# Refresh the gscites fields. update_citation_count modifies the entries in place and
# returns the same list, so diag_bib_raw_new_cits and diag_bib_raw are the same object.
diag_bib_raw_new_cits, ss_ids_not_found_for_citations = update_citation_count(diag_bib_raw)

In [None]:
# Write the entries with refreshed citation counts to the temporary bib file.
# TODO: update to the correct output path
save_to_file(diag_bib_raw_new_cits, None, diag_bib_path_tmp_new)

# Update the blacklist

In [None]:
# Last we update the blacklist: read the current blacklist, append the rows collected
# in blacklist_items and write the result to a separate temporary csv.
# (open question: what failures can happen here?)
blacklist_df = pd.read_csv('./script_data/blacklist.csv')
# TODO: fix to correct output location
blacklist_out_file = './script_data/blacklist_tmp_updated.csv'
# file writing
update_blacklist_csv(blacklist_df, blacklist_items, blacklist_out_file)

In [None]:
# Report of the rows where we did not know what to do or where the action failed
print("DONE with processing manually checked items")
print('Failures are as follows:')
for item in failed_new_items:
    print('Failed to add new bib entry ', item['ss_id'])
for item in failed_updated_items:
    # message used to read "exiting bib entry"
    print('Failed to update existing bib entry with new ss_id', item['bibkey'], item['ss_id'])
for item in failed_to_find_actions:
    print('Failed to find valid action for item', item['ss_id'], item['action'])
for item in ss_ids_not_found_for_citations:
    print('Failed to find this ss_id to update citations', item)

In [None]:
# Summary counts; the total should equal the number of rows in the manual check file.
print(f"Blacklisted items: {len(blacklist_items)}")
print(f"Updated items: {len(items_to_update)}")
print(f"Newly added items: {num_items_added}")

# Count the rows the processing loop skipped as '[None]'. The test mirrors the
# loop's dispatch order: rows containing a comma, '[add ss_id]' or 'blacklist'
# were handled by an earlier branch and must not be counted a second time.
count_action_none = np.sum(np.fromiter(
    (
        '[None]' in str(action).strip()
        and ',' not in str(action)
        and '[add ss_id]' not in str(action)
        and 'blacklist' not in str(action)
        for action in manually_checked['action']
    ),
    dtype=bool,
))
print(f"Items with action None: {count_action_none}")


# failed_updated_items is a subset of items_to_update, so it is not added again
# here (it used to be counted twice).
print(f"total processed items: {len(blacklist_items) + len(items_to_update) + num_items_added + len(failed_new_items) + len(failed_to_find_actions) + count_action_none}")
print(f"amount of items in manual checkfile: {manually_checked.shape[0]}")


In [None]:
# Final write of the bib entries to the temporary bib file
# (same call as the save that follows the citation update).
# TODO: update to the correct output path
save_to_file(diag_bib_raw_new_cits, None, diag_bib_path_tmp_new)