In [None]:
import pandas as pd
import os
import string
from get_biblatex import GetBiblatex
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
from ast import literal_eval
from collections import defaultdict
from semanticscholar import SemanticScholar

In [None]:
# NOTE: everything between the triple quotes below is a string literal, not live code:
# remove_blacklist_items is NOT defined by this cell. It is kept only as a reference
# until the functionality moves elsewhere (see the KM note on the first line).
''' KM: Remove this fn from here. Dre to update GenerateCSVFile with method like this which handles no-doi items
def remove_blacklist_items(df_new_items):
    blacklisted_items = pd.read_csv("./script_data/blacklist.csv")
    initial_length = len(df_new_items)
    df_new_items = df_new_items[~df_new_items['ss_doi'].isin(blacklisted_items['doi'].unique().tolist())] # remove blacklisted dois
    df_new_items = df_new_items[~df_new_items['ss_id'].isin(blacklisted_items['ss_id'].unique().tolist())] # remove blacklisted dois

    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")
    return df_new_items
'''

In [None]:
def get_item_to_blacklist(item): # item here is a row from the manually checked csv file
    """Build the blacklist.csv record for one manually checked row.

    Parameters
    ----------
    item : mapping-like
        Anything exposing ``.get(key, default)`` (a dict or a pandas row).
        Columns that are absent simply yield ``None`` in the result.

    Returns
    -------
    dict
        Record with the keys ``staff_id``, ``staff_name``, ``ss_year``,
        ``ss_id``, ``title``, ``doi``, ``Should be in diag.bib`` (always
        ``'no'``) and ``Reason``.
    """
    return {
        'staff_id': item.get('staff_id', None),
        # Fixed: this used to read the 'staff_id' column, so the name was never stored.
        'staff_name': item.get('staff_name', None),
        'ss_year': item.get('ss_year', None),
        'ss_id': item.get('ss_id', None),
        'title': item.get('ss_title', None),
        'doi': item.get('ss_doi', None),
        'Should be in diag.bib': 'no',
        'Reason': item.get('Blacklist reason', None)
    }

In [None]:
def update_blacklist_csv(blacklist_csv, blacklist_entries, blacklist_path='./script_data/blacklist.csv'): #blacklist_csv is a df
    """Append new records to the blacklist and write it back to disk.

    Parameters
    ----------
    blacklist_csv : pandas.DataFrame
        The current content of the blacklist.
    blacklist_entries : list of dict
        Records to append (e.g. as produced by ``get_item_to_blacklist``).
    blacklist_path : str, optional
        Where the combined blacklist is written. Defaults to the location
        this notebook has always used.

    Returns
    -------
    str
        Human-readable summary of how many items were added.
    """
    # Only concatenate when there is something to add; the file is still
    # (re)written either way, as before.
    if len(blacklist_entries) > 0:
        blacklist_csv = pd.concat([blacklist_csv, pd.DataFrame(blacklist_entries)], ignore_index=True)

    # Save blacklist.csv
    blacklist_csv.to_csv(blacklist_path, index=False)
    return f"{len(blacklist_entries)} items added to blacklist"

In [None]:
# Code to get citations from semantic scholar. If there are multiple ss_ids, we should get the number of citations for each of them and sum the two (or more?) values.
def get_citations(semantic_scholar_ids):
    """Look up the citation count of each given Semantic Scholar id.

    Parameters
    ----------
    semantic_scholar_ids : iterable of str, or a single str
        Ids to look up. A bare string is treated as one id.

    Returns
    -------
    dict
        Maps the ``paperId`` reported for each looked-up paper to the length
        of that paper's ``citations`` list. Ids that resolve to the same
        ``paperId`` share one key, so summing the values does not count the
        same paper twice. Empty input returns ``{}`` without creating a client.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(semantic_scholar_ids, str):
        semantic_scholar_ids = [semantic_scholar_ids]

    ss_ids = list(semantic_scholar_ids)
    if not ss_ids:
        return {}

    # One client is enough for all lookups; no need to rebuild it per id.
    sch = SemanticScholar()
    dict_cits = {}
    for ss_id in ss_ids:
        paper = sch.get_paper(ss_id)
        dict_cits[paper['paperId']] = len(paper['citations'])
    return dict_cits

In [None]:
def get_bib_info(diag_bib_file, item): #diag_bib_file is the file read in as a string, item is row from csv
    """Return the bib text for a new item, or None when nothing should be added.

    Parameters
    ----------
    diag_bib_file : str
        Full content of the bib file.
    item : mapping-like
        Row of the manually checked csv; its ``'ss_doi'`` value is used.

    Returns
    -------
    str or None
        The text produced by ``GetBiblatex.get_bib_text()``. ``None`` when the
        row has no usable DOI, when the DOI already occurs in
        ``diag_bib_file``, or when the reader returned ``'empty'``.
    """
    doi = item['ss_doi']

    # Rows without a DOI (e.g. a float NaN from the csv, None, or a blank string)
    # cannot be looked up; `doi in diag_bib_file` would raise TypeError for non-strings.
    if not isinstance(doi, str) or not doi.strip():
        return None

    # make sure doi is not already in diag.bib
    if doi in diag_bib_file:
        return None

    # TODO: citation lookup via get_citations(item['ss_id']) is currently disabled,
    # so new entries are created with a citation count of 0.
    citations = 0

    # Get BibLatex information based on DOI if not in the file
    reader = GetBiblatex(doi=doi, diag_bib=diag_bib_file, num_citations=citations)
    bibtext = reader.get_bib_text()

    # Return the bibtext if it is not 'empty', otherwise return None
    return bibtext if bibtext != 'empty' else None





In [None]:
def _parse_ss_ids(raw_value):
    """Return the ids stored in an ``all_ss_ids`` field value as a list of strings.

    Handles both formats this notebook writes: a single bare id (``{abc}``)
    and a Python list literal (``{['abc', 'def']}``).
    """
    inner = str(raw_value).strip().strip('{}').strip()
    if not inner:
        return []
    try:
        parsed = literal_eval(inner)
    except (ValueError, SyntaxError):
        # A bare id is not a Python literal; fall through to plain tokenising.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(known_id) for known_id in parsed]
    # Treat every punctuation character as a separator and split on whitespace.
    separators = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return inner.translate(separators).split()


def add_ss_id_to_existing_bibkey(diag_bib_raw, ss_id, bibkey):
    """Register ``ss_id`` in the ``all_ss_ids`` field of the entry with key ``bibkey``.

    Parameters
    ----------
    diag_bib_raw : list
        Bib entries; each is expected to expose ``type``, ``key`` and
        ``fields``. Entries whose ``type`` is ``'string'`` are skipped.
    ss_id : str
        Semantic Scholar id to add.
    bibkey : str
        Key of the entry to update.

    Returns
    -------
    list
        The same ``diag_bib_raw`` list, updated in place. If no entry matches
        ``bibkey`` a failure message is printed and the list is returned
        unchanged.
    """
    new_id = str(ss_id)

    #Update bibkey with ss_id
    for ind, entry in enumerate(diag_bib_raw):
        if entry.type == 'string':
            continue
        if bibkey != entry.key:
            continue

        if 'all_ss_ids' in entry.fields:
            known_ids = _parse_ss_ids(entry.fields['all_ss_ids'])
        else:
            known_ids = []

        # Only touch the field when the id is really new, so existing values
        # keep their formatting and the stored order stays deterministic.
        if new_id not in known_ids:
            known_ids.append(new_id)
            if len(known_ids) == 1:
                entry.fields['all_ss_ids'] = '{' + new_id + '}'
            else:
                entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'

        # put the updated entry back into the list
        diag_bib_raw[ind] = entry
        print(new_id, 'added to diag.bib')
        return diag_bib_raw

    # if we haven't returned by now then we failed to update
    print('failed to add ss_id to diag.bib', str(ss_id), str(bibkey))
    return diag_bib_raw

In [None]:
def update_citation_count(path_diag_bib):
    """Read a bib file and refresh the ``gscites`` field of its entries.

    For every non-``'string'`` entry that has an ``all_ss_ids`` field, the ids
    are looked up with ``get_citations`` and the counts are summed. ``gscites``
    is set when it is missing and only ever raised, never lowered, when it is
    already present.

    Parameters
    ----------
    path_diag_bib : str
        Path handed to ``read_bibfile``.

    Returns
    -------
    list
        The entries read from ``path_diag_bib`` with updated ``gscites``
        fields. Nothing is written to disk here; the caller must save the
        returned entries.
    """
    diag_bib_raw = read_bibfile(None, path_diag_bib)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' not in entry.fields:
            continue

        # split() without an argument drops empty tokens, so an empty field
        # (e.g. '{}') no longer results in a lookup for the id ''.
        all_ss_ids = entry.fields['all_ss_ids'].translate(strip_punctuation).split()
        if not all_ss_ids:
            continue

        n_cits = sum(get_citations(all_ss_ids).values())

        if 'gscites' in entry.fields:
            # keep the stored value when it is already at least as high
            if n_cits > int(entry.fields['gscites'].strip('{}')):
                entry.fields['gscites'] = '{' + str(n_cits) + '}'
        else:
            entry.fields['gscites'] = '{' + str(n_cits) + '}'
    return diag_bib_raw

# Load csv files

In [None]:
# Load the manually checked csv. The processing loop further down reads the
# 'action' column of every row to decide what to do with that row.
manually_checked = pd.read_csv("./script_data/manual_check.csv")
# manually_checked = remove_blacklist_items(manually_checked)     # This should be done before actually manually checking

# POTENTIAL TO-DO CREATE ACTION MAPPINGS


In [None]:
# Iterate through all items in the manually checked csv and collect, per action:
#   - the bib text of completely new items        -> items_to_add
#   - rows whose ss_id must be added to a bibkey  -> items_to_update
#   - records for the blacklist                   -> blacklist_items
#   - rows we could not act on                    -> unhandled_items
blacklist_items = []
items_to_add = ''
items_to_update = []
unhandled_items = []  # (ss_id, reason) pairs, reported at the end

# Load diag.bib as a string BEFORE the loop: get_bib_info needs it for every
# 'add new item' row (loading it afterwards raised NameError on a fresh kernel).
cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, ".."))
#TODO: in the end when this script is routine this should just read the live diag.bib
diag_bib_path = os.path.join(parent_directory, 'scripts/script_data/diag_ss.bib')
with open(diag_bib_path, encoding="utf8") as bibtex_file:
    diag_bib = bibtex_file.read()

#TODO: Make sure new items or updated items in the bib-file include pmid and doi if they did not previously

for index, bib_item in manually_checked.iterrows():
    print(f"Working on {index}/{len(manually_checked)}")
    action = bib_item['action']

    # An empty cell is not a string (pandas reads it as NaN); `in` would raise TypeError on it
    if not isinstance(action, str):
        print(f"{bib_item['ss_id']} has no action filled in, skipping")
        unhandled_items.append((bib_item['ss_id'], 'no action filled in'))
        continue

    # Make sure item is manually checked
    if "," in action:
        print(f"{bib_item['ss_id']} has not been checked yet, make sure only 1 action is mentioned")
        unhandled_items.append((bib_item['ss_id'], 'more than one action'))
        continue
        #TODO: we will later work from a dropdown-list rather than a comma separated set of actions so this probably will need updating

    # Add new item to diag.bib
    elif "add new item" in action:
        # Include the entries collected so far, so a DOI that occurs on several
        # rows is only added once.
        bib_item_text = get_bib_info(diag_bib + items_to_add, bib_item)

        if bib_item_text is not None:
            items_to_add += bib_item_text
        else:
            print(f"Unable to gather information for {bib_item['ss_doi']}")
            unhandled_items.append((bib_item['ss_id'], 'no bib information gathered'))

    # Add ss_id to already existing doi in diag.bib
    elif "add ss_id" in action:
        # just store a list of these items for now and we will update the file at the end
        items_to_update.append(bib_item)

    # Get items to blacklist
    elif "blacklist" in action:
        blacklist_item = get_item_to_blacklist(bib_item)
        blacklist_items.append(blacklist_item)

    # Get None items
    elif 'None' in action:
        continue

    # Anything else is an action we do not know how to handle
    else:
        print(f"{bib_item['ss_id']} has an unknown action '{action}', skipping")
        unhandled_items.append((bib_item['ss_id'], f"unknown action '{action}'"))

# First we add the completely new bib entries to the bib file content:
# append the new items to the string
diag_bib += items_to_add
# save the file to disk (temporarily commented)
# with open(diag_bib_path, "w", encoding="utf8") as bibtex_file:
#    bibtex_file.write(diag_bib)

# TO-DO FILE STILL HAVE TO BE SAVED
# TO-DO: RE-READ BIB FILE AND UPDATE SS_IDS LIST FROM CODE IN FOR LOOP

# Second we re-open the bib file using the read_bibfile method and update existing items with new ss_ids
diag_bib_raw = read_bibfile(None, diag_bib_path)
for item_to_update in items_to_update:
    diag_bib_raw = add_ss_id_to_existing_bibkey(diag_bib_raw, item_to_update["ss_id"], item_to_update["bibkey"])
# and save the bibfile with the newly added ss_ids (temporarily commented)
# save_to_file(diag_bib_raw, diag_bib_path)

# Third we update the blacklist (temporarily commented)
blacklist = pd.read_csv('./script_data/blacklist.csv')
# update_blacklist_csv(blacklist, blacklist_items)

# Report the rows where we did not know what to do or failed to do the action
if unhandled_items:
    print(f"{len(unhandled_items)} rows were not processed:")
    for unhandled_ss_id, reason in unhandled_items:
        print(f"  {unhandled_ss_id}: {reason}")
print("DONE with processing manually checked items")

In [None]:
# Count the occurrences of the literal '{yes}' in the collected bib text.
# NOTE(review): presumably every generated entry contains exactly one '{yes}'
# field, which makes this equal to the number of new entries - confirm against GetBiblatex.
count = items_to_add.count('{yes}')
print(f"Newly added items: {count}")

In [None]:
# Number of rows collected for the blacklist by the processing loop.
print(f"Blacklisted items: {len(blacklist_items)}")

# Update citation counts

In [None]:
path_diag_bib = os.path.join('script_data/', 'diag_ss.bib')

# update_citation_count reads the bib file itself and returns the entries it
# updated; previously that return value was discarded, so the saved file never
# contained the new citation counts.
diag_bib_citations = update_citation_count(path_diag_bib)

# Copy the refreshed counts onto the in-memory entries (which also carry the
# ss_ids added earlier in this notebook), matching entries by bib key.
gscites_by_key = {
    entry.key: entry.fields['gscites']
    for entry in diag_bib_citations
    if entry.type != 'string' and 'gscites' in entry.fields
}
for entry in diag_bib_raw:
    if entry.type != 'string' and entry.key in gscites_by_key:
        entry.fields['gscites'] = gscites_by_key[entry.key]

path_output_diag_bib = os.path.join('script_data/', 'diag_ss_new.bib')
save_to_file(diag_bib_raw, None, path_output_diag_bib)

In [None]:
# Scratch cell: the concatenation is only attempted when the second operand is set.
a, b = "string", None
if b is not None:
    c = a + b
    print(c)