In [2]:
import pandas as pd
import os
import string
from get_biblatex import GetBiblatex
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
from ast import literal_eval
from collections import defaultdict
from semanticscholar import SemanticScholar

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
''' KM: Remove this fn from here. Dre to update GenerateCSVFile with method like this which handles no-doi items
def remove_blacklist_items(df_new_items):
    blacklisted_items = pd.read_csv("./script_data/blacklist.csv")
    initial_length = len(df_new_items)
    df_new_items = df_new_items[~df_new_items['ss_doi'].isin(blacklisted_items['doi'].unique().tolist())] # remove blacklisted dois
    df_new_items = df_new_items[~df_new_items['ss_id'].isin(blacklisted_items['ss_id'].unique().tolist())] # remove blacklisted dois

    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")
    return df_new_items
'''

' KM: Remove this fn from here. Dre to update GenerateCSVFile with method like this which handles no-doi items\ndef remove_blacklist_items(df_new_items):\n    blacklisted_items = pd.read_csv("./script_data/blacklist.csv")\n    initial_length = len(df_new_items)\n    df_new_items = df_new_items[~df_new_items[\'ss_doi\'].isin(blacklisted_items[\'doi\'].unique().tolist())] # remove blacklisted dois\n    df_new_items = df_new_items[~df_new_items[\'ss_id\'].isin(blacklisted_items[\'ss_id\'].unique().tolist())] # remove blacklisted dois\n\n    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")\n    return df_new_items\n'

In [3]:
def get_item_to_blacklist(item): # item here is a row from the manually checked csv file
    """Build the blacklist record for one manually checked row.

    Args:
        item: Mapping-like row exposing ``.get(key, default)`` (for example a
            pandas Series or a dict) taken from the manually checked file.

    Returns:
        dict: One row for blacklist.csv. Source columns missing from ``item``
        are stored as None, and 'Should be in diag.bib' is always 'no'.
    """
    move_to_blacklist = {
        'staff_id': item.get('staff_id', None),
        # Read the name from its own column; this used to copy 'staff_id' again.
        'staff_name': item.get('staff_name', None),
        'ss_year': item.get('ss_year', None),
        'ss_id': item.get('ss_id', None),
        'title': item.get('ss_title', None),
        'doi': item.get('ss_doi', None),
        'Should be in diag.bib': 'no',
        'Reason': item.get('Blacklist reason', None)
    }

    return move_to_blacklist

In [4]:
def update_blacklist_csv(blacklist_df, blacklist_entries, blacklist_out_file): #blacklist_csv is a df
    """Append new blacklist rows to the existing blacklist and write it as CSV.

    Args:
        blacklist_df: DataFrame holding the current blacklist.
        blacklist_entries: List of row dicts to append.
        blacklist_out_file: Path the combined table is written to (no index column).

    Returns:
        str: Message stating how many entries were appended.
    """
    new_rows = pd.DataFrame(blacklist_entries)
    combined = pd.concat([blacklist_df, new_rows], ignore_index=True)
    combined.to_csv(blacklist_out_file, index=False)

    added_count = len(blacklist_entries)
    return f"{added_count} items added to blacklist"

In [5]:
def get_citations(semantic_scholar_ids):
    """Fetch citation counts from Semantic Scholar for a collection of paper ids.

    Args:
        semantic_scholar_ids: Iterable of Semantic Scholar paper ids. Entries
            that are None, empty or whitespace-only are skipped instead of
            being sent to the API.

    Returns:
        dict: Maps the ``paperId`` reported by Semantic Scholar to the length
        of that paper's ``citations`` list. Because the result is keyed by the
        reported ``paperId``, ids that resolve to the same paper collapse into
        one entry. Returns an empty dict when there is nothing to look up.
    """
    wanted_ids = []
    for ss_id in semantic_scholar_ids:
        if ss_id is None:
            continue
        cleaned = str(ss_id).strip()
        if cleaned:
            wanted_ids.append(cleaned)

    if not wanted_ids:
        return {}

    # One client is enough for all lookups; no need to rebuild it per id.
    sch = SemanticScholar()
    dict_cits = {}
    for ss_id in wanted_ids:
        paper = sch.get_paper(ss_id)
        dict_cits[paper['paperId']] = len(paper['citations'])
    return dict_cits

In [37]:
def get_bib_info(diag_bib_file, item): #diag_bib_file is the file read in as a string, item is row from csv
    """Return the bib text for a new item, or None if it must not be added.

    Args:
        diag_bib_file: Full contents of the bib file as one string.
        item: Row from the manually checked file; 'ss_doi' and 'ss_id' are read.

    Returns:
        The text produced by ``GetBiblatex(...).get_bib_text()``, or None when
        the row has no DOI, the exact DOI is already present in the bib file,
        or the reader returned 'empty'.
    """
    doi = str(item['ss_doi'])

    # if no ss_doi exists (a missing value shows up as the string 'nan')
    if len(doi) == 0 or doi == 'nan':
        print('no ss_doi available, I cannot add new bib entry', item['ss_id'])
        return None

    # Make sure the doi is not already in diag.bib. Every occurrence is checked:
    # the first hit may be a longer DOI that merely starts with this one, while
    # the exact DOI appears further down the file.
    found_similar = False
    search_from = 0
    while True:
        start_index = diag_bib_file.find(doi, search_from)
        if start_index == -1:
            break

        end_index = diag_bib_file.find('}', start_index)  # stop before the closing brace
        if end_index == -1:
            # No closing brace after the hit: compare against the rest of the file.
            end_index = len(diag_bib_file)
        matching_item_str = diag_bib_file[start_index:end_index]

        print('DOI already exists in bib file. Matching item:', matching_item_str)

        if matching_item_str == doi:
            print('doi already exists in bib file, I will not add new bib entry', item['ss_doi'], item['ss_id'])
            return None

        found_similar = True
        search_from = start_index + 1

    if found_similar:
        print('similar doi already exists in bib file, but new item will be added for ', item['ss_doi'], item['ss_id'])

    # Get BibLatex information based on DOI if not in the file
    reader = GetBiblatex(doi=item['ss_doi'], diag_bib=diag_bib_file)
    bibtext = reader.get_bib_text()

    # Return the bibtext if it is not 'empty', otherwise return None
    return bibtext if bibtext != 'empty' else None





In [7]:
def add_ss_id_to_existing_bibkey(diag_bib_raw, ss_id, bibkey):
    """Attach a Semantic Scholar id to the bib entry with the given key.

    Args:
        diag_bib_raw: Sequence of parsed bib entries exposing ``type``, ``key``
            and a dict-like ``fields``. Entries are modified in place.
        ss_id: Semantic Scholar id to record.
        bibkey: Key of the entry to update.

    Returns:
        list: ``[diag_bib_raw, 'Success']`` if an entry with ``bibkey`` was
        found (whether or not the id was already recorded), otherwise
        ``[diag_bib_raw, 'Fail']``.
    """
    new_id = str(ss_id)

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if bibkey != entry.key:
            continue

        if 'all_ss_ids' in entry.fields:
            # The field may hold a bare id ('{abc}') or a list repr
            # ("{['abc', 'def']}"). Treat braces, brackets, quotes and commas as
            # separators so both forms parse; literal_eval raised on the bare form.
            raw_value = entry.fields['all_ss_ids']
            for separator in "{}[]'\",":
                raw_value = raw_value.replace(separator, ' ')
            known_ids = raw_value.split()

            if new_id not in known_ids:
                known_ids.append(new_id)
                if len(known_ids) == 1:
                    entry.fields['all_ss_ids'] = '{' + new_id + '}'
                else:
                    # Existing ids keep their order; the new one goes last.
                    entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'
        else:
            # if there is no ss_id here yet just add this single one
            entry.fields['all_ss_ids'] = '{' + new_id + '}'

        print(str(ss_id), 'added to diag_bib_raw')
        return [diag_bib_raw, 'Success']

    # if we haven't returned by now then we failed to update
    print('failed to add ss_id to diag.bib', str(ss_id), str(bibkey))
    return [diag_bib_raw, 'Fail']

In [8]:
def update_citation_count(diag_bib_raw):
    """Refresh the 'gscites' field of every entry that lists Semantic Scholar ids.

    For each non-'string' entry with an 'all_ss_ids' field, the ids are looked
    up through ``get_citations`` and the returned counts are summed. 'gscites'
    is written when it is missing or when the new total is higher than the
    stored one; a much lower new total only produces a warning.

    Args:
        diag_bib_raw: Sequence of parsed bib entries exposing ``type``, ``key``
            and a dict-like ``fields``. Entries are modified in place.

    Returns:
        The same ``diag_bib_raw`` object that was passed in.
    """
    # Wrapper and separator characters become spaces, so ids written as
    # '{a}', "{['a', 'b']}" or '{a,b}' all split cleanly. Deleting them instead
    # would fuse 'a,b' into a single bogus id.
    separators_to_space = str.maketrans({char: ' ' for char in "{}[]()'\",;"})

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' not in entry.fields:
            continue

        all_ss_ids = entry.fields['all_ss_ids'].translate(separators_to_space).split()
        if not all_ss_ids:
            print('warning: all_ss_ids is empty, skipping citation update for', entry.key)
            continue

        dict_cits = get_citations(all_ss_ids)
        n_cits = sum(dict_cits.values())

        # TODO: is it correct logic to use this field name or should we make a new one?
        if 'gscites' in entry.fields:
            try:
                previous_cits = int(entry.fields['gscites'].strip('{}'))
            except ValueError:
                # Leave an unreadable value alone rather than abort the whole run.
                print('warning: could not read existing gscites, leaving it unchanged', entry.key, entry.fields['gscites'])
                continue

            # only update if we are increasing the number of citations!!!
            if n_cits > previous_cits:
                print('updating', entry.key, 'from', previous_cits, 'to', n_cits)
                entry.fields['gscites'] = '{' + str(n_cits) + '}'
            elif (previous_cits > (1.5 * n_cits)) and (previous_cits - n_cits > 10):
                print('warning: num citations calculated for this bibkey is much lower than previously suggested....', entry.key, previous_cits, n_cits)
        else:
            print('adding gscites', entry.key, n_cits)
            entry.fields['gscites'] = '{' + str(n_cits) + '}'

    print('done updating citations')
    return diag_bib_raw

# Load manually checked file (Excel workbook)

In [31]:
# load manually_checked
# The checked sheet is read from an Excel workbook (not a csv), relative to the
# current working directory.
manually_checked = pd.read_excel("./script_data/manual_check_20231018.xlsx")
# manually_checked = remove_blacklist_items(manually_checked)     # This should be done before actually manually checking

# load bib file just for reading at this point
#TODO: in the end when this script is routine this should just read the live diag.bib
# The bib path is built from the PARENT of the working directory plus 'scripts/...',
# whereas the Excel path above is relative to the working directory itself.
# NOTE(review): this presumably expects the notebook to run from the `scripts` folder — confirm.
cwd = os.getcwd()
parent_directory = os.path.dirname(cwd)
diag_bib_path = os.path.join(parent_directory, 'scripts/script_data/diag_ss.bib')
with open(diag_bib_path, 'r', encoding="utf8") as readonly_bib_file:
    # The whole file is kept as one string; get_bib_info searches it for DOIs.
    diag_bib_readonly = readonly_bib_file.read()
    
# POTENTIAL TO-DO CREATE ACTION MAPPINGS


In [38]:
# Iterate through all rows of the manually checked file and sort each one into
# the bucket that matches its 'action' column. Nothing is written to disk here;
# later cells consume the collections built below.
blacklist_items = []          # dicts destined for blacklist.csv
items_to_add = ''             # concatenated bib text of brand-new entries
items_to_update = []          # rows whose ss_id must be attached to an existing bibkey

failed_new_items = []         # 'add new item' rows for which no bib text was obtained
failed_updated_items = []     # filled by the later ss_id update cell
failed_to_find_actions = []   # rows with zero, several or unknown actions

#TODO: Make sure new items or updated items in the bib-file include pmid and doi if they did not previously

for index, bib_item in manually_checked.iterrows():
    print(f"Working on {index}/{len(manually_checked)}")

    # An empty spreadsheet cell is read as NaN (a float), and `"," in <float>`
    # raises TypeError. Working on the string form sends such rows to the
    # 'failed to find action' branch instead of aborting the loop.
    action = str(bib_item['action'])

    # Make sure item is manually checked
    if "," in action:
        print(f"{bib_item['ss_id']} has not been checked yet, make sure only 1 action is mentioned")
        failed_to_find_actions.append(bib_item)
        continue
        #TODO: we will later work from a dropdown-list rather than a comma separated set of actions so this probably will need updating

    # Add new item to diag.bib
    elif "add new item" in action:
        bib_item_text = get_bib_info(diag_bib_readonly, bib_item)

        if bib_item_text is not None:
            items_to_add += bib_item_text
        else:
            failed_new_items.append(bib_item)

    # Add ss_id to already existing doi in diag.bib
    elif "add ss_id" in action:
        # just store a list of these items for now and we will update the file at the end
        items_to_update.append(bib_item)

    # Get items to blacklist
    elif "blacklist" in action:
        blacklist_item = get_item_to_blacklist(bib_item)
        blacklist_items.append(blacklist_item)

    # Get None items
    elif 'None' in action:
        continue

    else:
        print('failed to find action')
        failed_to_find_actions.append(bib_item)

Working on 0/280
Working on 1/280
Working on 2/280
Working on 3/280
Working on 4/280
Working on 5/280
Working on 6/280
Working on 7/280
Working on 8/280
Working on 9/280
Working on 10/280
Working on 11/280
Working on 12/280
20
Working on 13/280
Working on 14/280
308
Working on 15/280
Working on 16/280
Working on 17/280
Working on 18/280
Working on 19/280
Working on 20/280
Working on 21/280
Working on 22/280
Working on 23/280
Working on 24/280
Working on 25/280
Working on 26/280
Working on 27/280
Working on 28/280
Working on 29/280
Working on 30/280
Working on 31/280
DOI already exists in bib file. Matching item: 10.1109/TMI.2016.2553401
doi already exists in bib file, I will not add new bib entry 10.1109/TMI.2016.2553401 1d2109f8ec43c23db647c4778a5bb5846074e575
Working on 32/280
Working on 33/280
Working on 34/280
Working on 35/280
Working on 36/280
Working on 37/280
Working on 38/280
Working on 39/280
Working on 40/280
Working on 41/280
Working on 42/280
Working on 43/280
Working on 4

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Working on 83/280
8
Working on 84/280
Working on 85/280
Working on 86/280
Working on 87/280
Working on 88/280
Working on 89/280
Working on 90/280
Working on 91/280
Working on 92/280
Working on 93/280
Working on 94/280
Working on 95/280
Working on 96/280
Working on 97/280
Working on 98/280
Working on 99/280
Working on 100/280
Working on 101/280
Working on 102/280
Working on 103/280
Working on 104/280
Working on 105/280
16
Working on 106/280
Working on 107/280
Working on 108/280
failed to find action
Working on 109/280
Working on 110/280
failed to find action
Working on 111/280
Working on 112/280
Working on 113/280
Working on 114/280
Working on 115/280
Working on 116/280
17
Working on 117/280
Working on 118/280
Working on 119/280
Working on 120/280
Working on 121/280
Working on 122/280
Working on 123/280
Working on 124/280
Working on 125/280
Working on 126/280
Working on 127/280
Working on 128/280
Working on 129/280
Working on 130/280
Working on 131/280
Working on 132/280
Working on 133/

# Add new bib entries to the diag.bib file 

In [None]:
# First we take the bib file string, append the completely new bib entries and save it.
# The combined text gets its own name: the previous in-place `+=` on
# diag_bib_readonly appended the new items again on every re-run of this cell.
diag_bib_with_new_items = diag_bib_readonly + items_to_add
# save the file to disk
# TODO : write to correct location
diag_bib_path_tmp_new = os.path.join(parent_directory, 'scripts/script_data/diag_ss_tmp_new.bib')
with open(diag_bib_path_tmp_new, 'w', encoding="utf8") as bibtex_file:
    # This used to write `diag_bib`, a name no cell defines (NameError on a fresh kernel).
    bibtex_file.write(diag_bib_with_new_items)

# Update existing bib entries with new ss_ids

In [None]:
# Second step: parse the bib file written above with read_bibfile and attach the
# newly found ss_ids to the entries that already exist.
# TODO read from correct location here
diag_bib_raw = read_bibfile(None, diag_bib_path_tmp_new)
for item_to_update in items_to_update:
    diag_bib_raw, result = add_ss_id_to_existing_bibkey(
        diag_bib_raw,
        item_to_update["ss_id"],
        item_to_update["bibkey"],
    )
    # Remember rows whose bibkey could not be found so the final report lists them.
    if result == 'Fail':
        failed_updated_items.append(item_to_update)


#Note we are not writing the file yet as we will use the same diag_bib_raw and update the citations on it first


# Update citation counts

In [None]:
# Refresh 'gscites' on every entry that has 'all_ss_ids' (one Semantic Scholar
# lookup per id). update_citation_count returns the object it was given, so
# diag_bib_raw_new_cits and diag_bib_raw refer to the same, modified entries.
diag_bib_raw_new_cits = update_citation_count(diag_bib_raw)


In [None]:
# TODO: update to the correct output path
# Hands the updated entries to save_to_file with the same temporary path they were read from.
# NOTE(review): presumably this overwrites diag_bib_path_tmp_new; the meaning of the
# None argument is defined in bib_handling_code.processbib — confirm there.
save_to_file(diag_bib_raw_new_cits, None, diag_bib_path_tmp_new)

# Update the blacklist

In [None]:
# Last we update the blacklist (what failures can happen here?)
# Read the current blacklist; the path is relative to the working directory.
blacklist_df = pd.read_csv('./script_data/blacklist.csv')
# TODO: fix to correct output location
# The result goes to a separate temporary file, so blacklist.csv itself is not modified here.
blacklist_out_file = './script_data/blacklist_tmp_updated.csv'
# file writing; the returned summary string is the cell's displayed output
update_blacklist_csv(blacklist_df, blacklist_items, blacklist_out_file)

In [None]:
# TODO: Here we provide a report of rows where we did not know what to do or we failed to do the action
# Prints one line per failed row, grouped by the three failure lists filled by the earlier cells.
print("DONE with processing manually checked items")
print('Failures are as follows:')
# 'add new item' rows for which get_bib_info returned None
for item in failed_new_items:
    print('Failed to add new bib entry ', item['ss_id'])
# 'add ss_id' rows whose bibkey was not found in the bib file
for item in failed_updated_items:
    print('Failed to update exiting bib entry with new ss_id', item['bibkey'], item['ss_id'])
# rows with several actions or an unrecognised action
for item in failed_to_find_actions:
    print('Failed to find valid action for item', item['ss_id'], item['action'])

In [None]:
# Counts occurrences of the literal text '{yes}' in the collected new bib text.
# NOTE(review): presumably every entry produced by GetBiblatex contains exactly one
# '{yes}' field, making this the number of new entries — confirm against GetBiblatex.
count = items_to_add.count('{yes}')
print(f"Newly added items: {count}")

In [None]:
# Number of rows the dispatch loop collected for the blacklist.
print(f"Blacklisted items: {len(blacklist_items)}")

In [141]:
# NOTE(review): this cell repeats the earlier "Update citation counts" cell verbatim;
# running both performs every Semantic Scholar lookup twice. Keep only one copy.
diag_bib_raw_new_cits = update_citation_count(diag_bib_raw)


adding gscites Tell21 154
adding gscites Teuw18 0
adding gscites Thag23 2
adding gscites Thee20 0
adding gscites Thij23 0
updating Timp02 from 10 to 11
updating Timp10 from 24 to 35
updating Trom12 from 3 to 4
adding gscites Turn21 4
adding gscites Valk19a 16
updating Vare05 from 46 to 52
updating Veli08d from 4 to 6
updating Veli09a from 2 to 3
updating Veli12 from 30 to 34
updating Veli13 from 36 to 45
updating Velz20 from 10 to 121
updating Ven11a from 8 to 12
updating Ven13b from 40 to 45
updating Ven16a from 15 to 21
updating Ven16f from 6 to 7
updating Vend17c from 18 to 36
adding gscites Vend18 54
adding gscites Venh15a 0
updating Venh15b from 45 to 72
adding gscites Venh15c 13
adding gscites Venh16a 11
updating Venh17a from 37 to 89
updating Venh17b from 59 to 112
adding gscites Venh18 112
adding gscites Venk21 51
adding gscites Venk23 1
adding gscites Vent20 2
adding gscites Vent21 16
adding gscites Vent23 6
updating Veta18 from 56 to 205
adding gscites Vina22 3
adding gscites

In [143]:
# TODO: update to the correct output path
# NOTE(review): duplicate of the earlier save cell; it hands the same entries to
# save_to_file with the same temporary path. Keep only one copy.
save_to_file(diag_bib_raw_new_cits, None, diag_bib_path_tmp_new)

# Update the blacklist

In [142]:
# Last we update the blacklist (what failures can happen here?)
# NOTE(review): duplicate of the earlier blacklist cell. Both read blacklist.csv and
# write the merged table to the same temporary file, so running both only repeats the write.
blacklist_df = pd.read_csv('./script_data/blacklist.csv')
# TODO: fix to correct output location
blacklist_out_file = './script_data/blacklist_tmp_updated.csv'
# file writing; the returned summary string is the cell's displayed output
update_blacklist_csv(blacklist_df, blacklist_items, blacklist_out_file)

'39 items added to blacklist'

In [135]:
# TODO: Here we provide a report of rows where we did not know what to do or we failed to do the action
# NOTE(review): duplicate of the earlier report cell; keep only one copy.
print("DONE with processing manually checked items")
print('Failures are as follows:')
# 'add new item' rows for which get_bib_info returned None
for item in failed_new_items:
    print('Failed to add new bib entry ', item['ss_id'])
# 'add ss_id' rows whose bibkey was not found in the bib file
for item in failed_updated_items:
    print('Failed to update exiting bib entry with new ss_id', item['bibkey'], item['ss_id'])
# rows with several actions or an unrecognised action
for item in failed_to_find_actions:
    print('Failed to find valid action for item', item['ss_id'], item['action'])

DONE with processing manually checked items
Failures are as follows:
Failed to add new bib entry  03ad8d7078805db6fbd4993b881045b462b4e028
Failed to add new bib entry  06ee6ed85131848ef70da625806ba480915aa2e0
Failed to add new bib entry  0b78520bea8310ff375cf953bbde10082db0eede
Failed to add new bib entry  0df7a4f26d57eb58fe628316aa5e84e5ca474ee8
Failed to add new bib entry  0ebe8ab65571514718283cd2d8ac7277db3513c5
Failed to add new bib entry  0f27fc10d593859a440c6ccf901d5093f67939bd
Failed to add new bib entry  17b918178a85cdb670be7521e6cef3b4dbffb16b
Failed to add new bib entry  1d2109f8ec43c23db647c4778a5bb5846074e575
Failed to add new bib entry  202f393ad41b85acbc59a28e5080d19c9de56988
Failed to add new bib entry  233a8c1b929ccbb0f4a31720919e2b9f413a239c
Failed to add new bib entry  269e8609dff88d78e8e3c41f81a97199c9add3dd
Failed to add new bib entry  2f2182f8e55be5a85c1316cd1b181cd5c85c106c
Failed to add new bib entry  32af51ced47419cff26fde66cce602fbab2f238a
Failed to add new bib

In [None]:
# NOTE(review): duplicate of the earlier count cell; keep only one copy.
# Counts occurrences of the literal text '{yes}' in the collected new bib text;
# presumably one per new entry — confirm against GetBiblatex.
count = items_to_add.count('{yes}')
print(f"Newly added items: {count}")

In [None]:
# NOTE(review): duplicate of the earlier blacklist-count cell; keep only one copy.
print(f"Blacklisted items: {len(blacklist_items)}")