In [37]:
import pandas as pd
import os
import string
from semanticscholar import SemanticScholar
from get_biblatex import GetBiblatex
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
from ast import literal_eval
from collections import defaultdict

In [2]:
def remove_blacklist_items(df_new_items, blacklist_path="./script_data/blacklist.csv"):
    """Drop rows of ``df_new_items`` whose DOI or Semantic Scholar id is blacklisted.

    Parameters
    ----------
    df_new_items : pandas.DataFrame
        Newly found items; must contain the columns ``ss_doi`` and ``ss_id``.
    blacklist_path : str, optional
        CSV file with (at least) the columns ``doi`` and ``ss_id``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_new_items`` that match neither a blacklisted DOI nor a
        blacklisted ``ss_id``. The number of removed rows is printed.
    """
    blacklisted_items = pd.read_csv(blacklist_path)
    initial_length = len(df_new_items)

    # Missing values must be dropped before matching: Series.isin() treats NaN as
    # equal to NaN, so a single blacklist row without a DOI would otherwise remove
    # every new item that has no DOI (and likewise for ss_id).
    blacklisted_dois = blacklisted_items['doi'].dropna().unique().tolist()
    blacklisted_ss_ids = blacklisted_items['ss_id'].dropna().unique().tolist()

    is_blacklisted = (
        df_new_items['ss_doi'].isin(blacklisted_dois)
        | df_new_items['ss_id'].isin(blacklisted_ss_ids)
    )
    df_new_items = df_new_items[~is_blacklisted]

    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")
    return df_new_items

In [3]:
def add_item_to_blacklist(item, blacklist_path='./script_data/blacklist.csv'):
    """Append one manually reviewed item to the blacklist CSV.

    Parameters
    ----------
    item : mapping (dict or pandas.Series)
        Must provide ``ss_id``; the keys ``staff_id``, ``staff_name``, ``ss_year``,
        ``ss_title``, ``ss_doi`` and ``Blacklist reason`` are copied when present
        and stored as missing otherwise.
    blacklist_path : str, optional
        CSV file that is read, extended by one row and written back.

    Returns
    -------
    str
        A confirmation message containing the item's ``ss_id``.
    """
    # Read blacklist.csv
    blacklist_csv = pd.read_csv(blacklist_path)

    # Build the new row; the blacklist uses its own column names for title/doi/reason.
    move_to_blacklist = {
        'staff_id': item.get('staff_id', None),
        # Previously filled from 'staff_id', which stored the id in the name column.
        'staff_name': item.get('staff_name', None),
        'ss_year': item.get('ss_year', None),
        'ss_id': item.get('ss_id', None),
        'title': item.get('ss_title', None),
        'doi': item.get('ss_doi', None),
        'Should be in diag.bib': 'no',
        'Reason': item.get('Blacklist reason', None)
    }

    blacklist_csv = pd.concat([blacklist_csv, pd.DataFrame([move_to_blacklist])], ignore_index=True)
    # Save blacklist.csv
    blacklist_csv.to_csv(blacklist_path, index=False)

    return f"{item['ss_id']}: added to blacklist.csv"

In [54]:
# Code to get citations from semantic scholar. If an item has multiple ss_ids, the caller
# is expected to sum the per-paper counts returned here.
def get_citations(semantic_scholar_ids):
    """Look up the number of citations for one or more Semantic Scholar ids.

    Parameters
    ----------
    semantic_scholar_ids : str or iterable of str
        A single id or a collection of ids. A bare string is treated as ONE id
        (iterating it would otherwise query the API once per character).

    Returns
    -------
    dict
        Maps the ``paperId`` reported for each looked-up paper to
        ``len(paper['citations'])``. Ids for which the same ``paperId`` is
        reported share a single entry.
    """
    if isinstance(semantic_scholar_ids, str):
        semantic_scholar_ids = [semantic_scholar_ids]

    # One client is enough for all lookups; no need to rebuild it per id.
    sch = SemanticScholar()
    dict_cits = {}
    for ss_id in semantic_scholar_ids:
        paper = sch.get_paper(ss_id)
        # NOTE(review): this counts the citation records returned with the paper; confirm
        # the client returns the full list for highly cited papers.
        dict_cits[paper['paperId']] = len(paper['citations'])
    return dict_cits

In [5]:
# Read the current diag.bib, located one directory above the working directory, as text.
cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, os.pardir))
diag_bib_path = os.path.join(parent_directory, 'diag.bib')
with open(diag_bib_path, mode="r", encoding="utf8") as bibtex_file:
    diag_bib = bibtex_file.read()

In [6]:
def add_item_to_diag_bib(item):
    """Build a bib entry for ``item`` and append it to the in-memory diag.bib text.

    Parameters
    ----------
    item : mapping (dict or pandas.Series)
        Must provide ``ss_id`` (a single id or a list of ids) and ``ss_doi``.

    Returns
    -------
    str
        A message naming the item.

    Notes
    -----
    TODO: the extended text is only held in a local variable; nothing is written
    back to diag.bib yet.
    """
    # Load diag.bib
    cwd = os.getcwd()
    parent_directory = os.path.abspath(os.path.join(cwd, ".."))
    diag_bib_path = os.path.join(parent_directory, 'diag.bib')
    with open(diag_bib_path, encoding="utf8") as bibtex_file:
        diag_bib = bibtex_file.read()

    # Total citations over all ids of this item. get_citations() expects a collection
    # of ids and returns a {paperId: count} dict, so normalise to a list and sum the
    # values instead of adding the dict itself to an integer.
    ss_ids = item['ss_id'] if isinstance(item['ss_id'], list) else [item['ss_id']]
    citations = sum(get_citations(ss_ids).values())

    # Update Dre's code and add an extra argument for citations so that it uses the correct number when creating the bib item?
    reader = GetBiblatex(doi=item['ss_doi'], diag_bib=diag_bib, num_citations=citations)
    bibtext = reader.get_bib_text()
    # Add information to diag.bib; the string 'empty' is treated as "no entry produced".
    if bibtext != 'empty':
        diag_bib = diag_bib + bibtext
    # Save diag.bib  (TODO: not implemented, see Notes)

    return f"{item} added to diag.bib"

In [10]:
def add_ss_id_to_existing_bibkey(diag_bib_raw, ss_id, bibkey):
    """Record ``ss_id`` in the ``all_ss_ids`` field of the entry whose key is ``bibkey``.

    Parameters
    ----------
    diag_bib_raw : iterable
        Parsed bib entries exposing ``type``, ``key`` and a dict-like ``fields``.
        Entries of type ``'string'`` are skipped.
    ss_id : str
        Semantic Scholar id to add.
    bibkey : str
        Key of the entry to update.

    Returns
    -------
    str
        ``"<ss_id> added to diag.bib"`` when the entry was found (whether or not the
        id was already present), otherwise ``"<bibkey> not found"``.

    Notes
    -----
    The field is written as ``{<id>}`` for a single id and as ``{['id1', 'id2']}``
    for several. Existing ids keep their order and the new id is appended, so
    repeated runs produce stable output.
    """
    new_id = str(ss_id)
    # Turn every punctuation character (braces, brackets, quotes, commas) into a space
    # so both the '{id}' and the "{['id1', 'id2']}" layouts split into bare ids.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    # Update bibkey with ss_id
    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if bibkey != entry.key:
            continue

        # An entry without the field simply has no ids yet; it must still receive the new one.
        raw_ids = entry.fields['all_ss_ids'] if 'all_ss_ids' in entry.fields else ''
        known_ids = list(dict.fromkeys(raw_ids.translate(punctuation_to_space).split()))

        if new_id not in known_ids:
            known_ids.append(new_id)
            if len(known_ids) == 1:
                entry.fields['all_ss_ids'] = '{' + new_id + '}'
            else:
                entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'
        return f"{ss_id} added to diag.bib"
    return f"{bibkey} not found"

In [81]:
def update_citation_count(path_diag_bib):
    """Refresh the ``gscites`` field of every entry that lists Semantic Scholar ids.

    Parameters
    ----------
    path_diag_bib : str
        Path of the bib file to read.

    Returns
    -------
    The parsed entries as returned by ``read_bibfile``, with ``gscites`` set to the
    summed citation count of all ids in ``all_ss_ids`` when the field was missing
    or held a lower number. Nothing is written to disk; the caller must save the
    returned entries.
    """
    # Replace punctuation by spaces (rather than deleting it) so ids separated only
    # by a comma are not fused into one token.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    diag_bib_raw = read_bibfile(None, path_diag_bib)
    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' not in entry.fields:
            continue

        # split() without an argument drops empty tokens, so an empty field such as
        # '{}' yields no ids instead of an empty-string id sent to the API.
        all_ss_ids = entry.fields['all_ss_ids'].translate(punctuation_to_space).split()
        if not all_ss_ids:
            continue

        n_cits = sum(get_citations(all_ss_ids).values())

        # Only ever raise the stored count; a lower fresh number leaves it untouched.
        if 'gscites' in entry.fields:
            if n_cits > int(entry.fields['gscites'].strip('{}')):
                entry.fields['gscites'] = '{' + str(n_cits) + '}'
        else:
            entry.fields['gscites'] = '{' + str(n_cits) + '}'
    return diag_bib_raw

In [None]:
manually_checked = pd.read_excel("./script_data/manual_check_20231018 (1).xlsx")
manually_checked = remove_blacklist_items(manually_checked)     # This should be done before actually manually checking

path_diag_bib = os.path.join('script_data/', 'diag_ss.bib')
path_output_diag_bib = os.path.join('script_data/', 'diag_ss_new.bib')

# Parse the source bib file ONCE and apply every change to the same in-memory entries.
# Re-reading it for each row and overwriting the output file kept only the last added ss_id.
diag_bib_raw = read_bibfile(None, path_diag_bib)

for index, bib_item in manually_checked.iterrows():
    # A blank cell is read as NaN (a float); str() keeps the substring checks from raising.
    action = str(bib_item['action'])

    # Make sure item is manually checked
    if "," in action:
        print(f"{bib_item['ss_id']} has not been checked yet, make sure only 1 action is mentioned")

    # Add new item to diag.bib
    #elif "add new item" in action:
    #    add_item_to_diag_bib(bib_item)

    # Add ss_id to already existing doi in diag.bib
    elif "add ss_id" in action:
        #assert bib_item['ss_doi'] == bib_item['bib_doi'], f"ss_doi and bib_doi should be equal for {bib_item['ss_id']}"
        result = add_ss_id_to_existing_bibkey(diag_bib_raw, bib_item["ss_id"], bib_item["bibkey"])
        print(result)
    elif "blacklist" in action:
        add_item_to_blacklist(bib_item)

# Persist all ss_id additions in one write.
save_to_file(diag_bib_raw, None, path_output_diag_bib)

# Refresh citation counts on the file that now contains the new ss_ids, and save the
# entries that update_citation_count returns (it does not write to disk itself).
diag_bib_raw = update_citation_count(path_output_diag_bib)
save_to_file(diag_bib_raw, None, path_output_diag_bib)
print("DONE")

70 items removed from newly found items.
14198a817d2c0a800bfb0a0a36baf5097fe22054 added to diag.bib
2125835bf1c4fd0646b5dd50855d647044c07658 added to diag.bib
24b8cee45431f633d2fa6e3c05670f62b1e41e7e added to diag.bib
2723ce1686eea776df179e362cd9a8b8e2bb7ff1 added to diag.bib
2a7dafe1287670068300ff77401923f7e151b9f4 added to diag.bib
2fe9af8a6b41fc9db41e76621f037aac453ab433 added to diag.bib
349ca29f588b9c785085da7147a4b58df032a8bf added to diag.bib
374f4a7676183c95f901e655f2caf170cdd9ec9d added to diag.bib
37e383517c34818ad049af0aa763ad5906e9f51a added to diag.bib
44454c9090606d0332272ff38df6c87eac15f5f7 added to diag.bib
4f016eb85905d24ce82adeee28bdf66b46870c9e added to diag.bib
740d5d34fcd714870ddf0073fd8956db023319f0 added to diag.bib
81e0cd2d7df521651ce7a093409b02375ef05787 has not been checked yet, make sure only 1 action is mentioned
82eb0252d2239563a34b29e32b75f55f4e35536f has not been checked yet, make sure only 1 action is mentioned
83a8284194d62305f0dae376417dcaa540f4fd53 ha

e844b6b027c94468de8a607497f95a3771b7d48b added to diag.bib
f131ef217543d179269018950bf3b6ba2b30f3b1 added to diag.bib
f643f4a927cf65f1ec66231ae76d3bc1736a67d3 added to diag.bib
