In [2]:
import pandas as pd
import os
import string
from semanticscholar import SemanticScholar
from get_biblatex import GetBiblatex
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
from ast import literal_eval
from collections import defaultdict

In [3]:
def remove_blacklist_items(df_new_items, blacklist_path="./script_data/blacklist.csv"):
    """Drop rows whose DOI or Semantic Scholar id appears in the blacklist CSV.

    Args:
        df_new_items: DataFrame with at least the columns ``ss_doi`` and ``ss_id``.
        blacklist_path: CSV file with at least the columns ``doi`` and ``ss_id``.
            Defaults to the location this notebook has always read from.

    Returns:
        The filtered DataFrame. The number of removed rows is printed.
    """
    blacklisted_items = pd.read_csv(blacklist_path)
    initial_length = len(df_new_items)

    # Drop missing values before matching: isin() treats NaN as equal to NaN,
    # so a single blacklist row without a DOI would otherwise remove every
    # candidate item that has no DOI (and likewise for ss_id).
    blacklisted_dois = blacklisted_items['doi'].dropna().unique().tolist()
    blacklisted_ss_ids = blacklisted_items['ss_id'].dropna().unique().tolist()

    df_new_items = df_new_items[~df_new_items['ss_doi'].isin(blacklisted_dois)]  # remove blacklisted dois
    df_new_items = df_new_items[~df_new_items['ss_id'].isin(blacklisted_ss_ids)]  # remove blacklisted ss_ids

    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")
    return df_new_items

In [4]:
def get_item_to_blacklist(item):
    """Build the blacklist.csv row for one manually checked item.

    Args:
        item: Mapping-like object supporting ``.get`` (a dict or a pandas row).
            Keys that are absent are stored as ``None``.

    Returns:
        dict with the keys ``staff_id``, ``staff_name``, ``ss_year``, ``ss_id``,
        ``title``, ``doi``, ``Should be in diag.bib`` and ``Reason``.
    """
    move_to_blacklist = {
        'staff_id': item.get('staff_id', None),
        # Previously read from 'staff_id', which stored the id in the name column.
        'staff_name': item.get('staff_name', None),
        'ss_year': item.get('ss_year', None),
        'ss_id': item.get('ss_id', None),
        'title': item.get('ss_title', None),
        'doi': item.get('ss_doi', None),
        # Blacklisted items are by definition not wanted in diag.bib.
        'Should be in diag.bib': 'no',
        'Reason': item.get('Blacklist reason', None)
    }

    return move_to_blacklist

In [5]:
def update_blacklist_csv(blacklist_csv, blacklist_entries):
    """Append new entries to the blacklist and write it back to disk.

    Args:
        blacklist_csv: DataFrame holding the current blacklist.
        blacklist_entries: Records (e.g. a list of dicts) to append.

    Returns:
        A message stating how many entries were added.
    """
    new_rows = pd.DataFrame(blacklist_entries)
    merged = pd.concat([blacklist_csv, new_rows], ignore_index=True)

    # Persist the combined table without the positional index column.
    merged.to_csv('./script_data/blacklist.csv', index=False)

    n_added = len(blacklist_entries)
    return f"{n_added} items added to blacklist"

In [6]:
# Code to get citations from semantic scholar. If there are multiple ss_ids, we should get the number of citations for each of them and sum the two (or more?) values.
def get_citations(semantic_scholar_ids):
    """Look up the citation count of each Semantic Scholar id.

    Args:
        semantic_scholar_ids: Iterable of Semantic Scholar ids. A single id
            passed as a bare string is treated as a one-element list rather
            than being iterated character by character.

    Returns:
        dict mapping the ``paperId`` reported by Semantic Scholar to the
        number of entries in that paper's ``citations``. Ids that resolve to
        the same ``paperId`` share one key, so they are counted once.
    """
    if isinstance(semantic_scholar_ids, str):
        semantic_scholar_ids = [semantic_scholar_ids]
    ss_ids = list(semantic_scholar_ids)

    dict_cits = {}
    if not ss_ids:
        return dict_cits

    # One client for the whole batch instead of a new one per id.
    sch = SemanticScholar()
    for ss_id in ss_ids:
        paper = sch.get_paper(ss_id)
        dict_cits[paper['paperId']] = len(paper['citations'])
    return dict_cits

In [13]:
def get_bib_info(diag_bib_file, item):
    """Return the BibLaTeX text for ``item``'s DOI, or None if it cannot be added.

    Args:
        diag_bib_file: Full text of the bib file, used for the duplicate check
            and passed on to ``GetBiblatex``.
        item: Mapping-like row with an ``ss_doi`` entry.

    Returns:
        The text produced by ``GetBiblatex.get_bib_text()``, or None when the
        DOI is missing, already occurs in ``diag_bib_file``, or the reader
        returns the sentinel string ``'empty'``.
    """
    doi = item['ss_doi']

    # A missing DOI arrives as NaN/None, which makes the `in` test below raise
    # TypeError; an empty string would match any file. Neither can be looked up.
    if not isinstance(doi, str) or not doi.strip():
        return None

    # TODO: citation lookup via get_citations(item['ss_id']) is currently
    # disabled, so every new entry is created with a citation count of 0.
    citations = 0

    # make sure doi is not already in diag.bib
    if doi in diag_bib_file:
        return None

    # Get BibLatex information based on DOI if not in the file
    reader = GetBiblatex(doi=doi, diag_bib=diag_bib_file, num_citations=citations)
    bibtext = reader.get_bib_text()

    # Return the bibtext if it is not 'empty', otherwise return None
    return bibtext if bibtext != 'empty' else None





In [8]:
def _parse_all_ss_ids(raw_value):
    """Split a stored ``all_ss_ids`` value into a list of id strings.

    Accepts both shapes this notebook writes: ``{<id>}`` for a single id and
    ``{['<id1>', '<id2>']}`` for several.
    """
    inner = raw_value.strip('{}').strip().strip('[]')
    tokens = (token.strip(" '\"") for token in inner.split(','))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(token for token in tokens if token))


def add_ss_id_to_existing_bibkey(diag_bib_raw, ss_id, bibkey):
    """Record ``ss_id`` in the ``all_ss_ids`` field of the entry keyed ``bibkey``.

    Args:
        diag_bib_raw: Iterable of parsed bib entries exposing ``type``, ``key``
            and ``fields``; entries of type ``'string'`` are skipped.
        ss_id: Semantic Scholar id to add.
        bibkey: Key of the entry to update.

    Returns:
        ``"<ss_id> added to diag.bib"`` when the entry was found (the field is
        created if it did not exist and left untouched if the id was already
        listed), otherwise ``"<bibkey> not found"``.
    """
    new_id = str(ss_id)

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue

        if bibkey != entry.key:
            continue

        known_ids = _parse_all_ss_ids(entry.fields.get('all_ss_ids', ''))
        if new_id not in known_ids:
            known_ids.append(new_id)
            # Keep the existing on-disk shapes: bare id for one, list repr for many.
            if len(known_ids) == 1:
                entry.fields['all_ss_ids'] = '{' + known_ids[0] + '}'
            else:
                entry.fields['all_ss_ids'] = '{' + str(known_ids) + '}'
        return f"{ss_id} added to diag.bib"

    return f"{bibkey} not found"

In [9]:
def update_citation_count(path_diag_bib):
    """Refresh the ``gscites`` field of every entry that lists Semantic Scholar ids.

    Args:
        path_diag_bib: Path of the bib file to read with ``read_bibfile``.

    Returns:
        The parsed entries with ``gscites`` updated in place. Nothing is
        written to disk here; the caller has to save the result.
    """
    diag_bib_raw = read_bibfile(None, path_diag_bib)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' not in entry.fields:
            continue

        # Removing punctuation reduces both "{id}" and "{['id1', 'id2']}" to
        # whitespace-separated ids. split() without an argument drops empty
        # tokens, so an empty field never triggers a lookup for ''.
        all_ss_ids = entry.fields['all_ss_ids'].translate(strip_punctuation).split()
        if not all_ss_ids:
            continue

        n_cits = sum(get_citations(all_ss_ids).values())
        new_value = '{' + str(n_cits) + '}'

        if 'gscites' in entry.fields:
            # Only ever raise the stored count, never lower it.
            if n_cits > int(entry.fields['gscites'].strip('{}')):
                entry.fields['gscites'] = new_value
        else:
            entry.fields['gscites'] = new_value
    return diag_bib_raw

# Load csv files

In [10]:
# Load the manually checked items; the processing loop below reads the
# 'action' column of every row to decide what to do with it.
manually_checked = pd.read_csv("./script_data/manual_check.csv")
# manually_checked = remove_blacklist_items(manually_checked)     # This should be done before actually manually checking
# POTENTIAL TO-DO CREATE ACTION MAPPINGS

# Load diag.bib as plain text (used for the "DOI already present" check).
# NOTE(review): this reads ../diag_ss.bib, while the processing loop and the
# citation update read script_data/diag_ss.bib - confirm both are the same file.
cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, ".."))
diag_bib_path = os.path.join(parent_directory, 'diag_ss.bib')
with open(diag_bib_path, encoding="utf8") as bibtex_file:
    diag_bib = bibtex_file.read()

# Load the current blacklist (only used by the commented-out
# update_blacklist_csv call after the processing loop).
blacklist = pd.read_csv('./script_data/blacklist.csv')


63 items removed from newly found items.


In [14]:
# Iterate through all items in the manually checked csv
blacklist_items = []
items_to_add = ''

# The bib file is parsed at most once and saved once after the loop.
# Previously every "add ss_id" row re-read the original file and overwrote
# the output, so only the last ss_id update survived in diag_ss_new.bib.
path_diag_bib = os.path.join('script_data/', 'diag_ss.bib')
path_output_diag_bib = os.path.join('script_data/', 'diag_ss_new.bib')
diag_bib_raw = None  # parsed lazily, on the first "add ss_id" row

for index, bib_item in manually_checked.iterrows():
    print(f"Working on {index}/{len(manually_checked)}")
    action = bib_item['action']

    # An empty cell is read by pandas as NaN (and, depending on the pandas
    # version, so may a literal "None"); substring tests on it would raise.
    if not isinstance(action, str):
        print(f"{bib_item['ss_id']} has no action set, skipping")
        continue

    # Make sure item is manually checked
    if "," in action:
        print(f"{bib_item['ss_id']} has not been checked yet, make sure only 1 action is mentioned")
        continue

    # Add new item to diag.bib
    elif "add new item" in action:
        bib_item_text = get_bib_info(diag_bib, bib_item)

        if bib_item_text is not None:
            items_to_add += bib_item_text
        else:
            print(f"Unable to gather information for {bib_item['ss_doi']}")
            # TO-DO APPEND items we were unable to add

    # Add ss_id to already existing doi in diag.bib
    elif "add ss_id" in action:
        if diag_bib_raw is None:
            diag_bib_raw = read_bibfile(None, path_diag_bib)
        #assert bib_item['ss_doi'] == bib_item['bib_doi'], f"ss_doi and bib_doi should be equal for {bib_item['ss_id']}"
        result = add_ss_id_to_existing_bibkey(diag_bib_raw, bib_item["ss_id"], bib_item["bibkey"])
        print(result)

    # Get items to blacklist
    elif "blacklist" in action:
        blacklist_item = get_item_to_blacklist(bib_item)
        blacklist_items.append(blacklist_item)

    # Get None items
    elif 'None' in action:
        continue

    # Notify when an action matches none of the cases above
    else:
        print(f"No matching action for {bib_item['ss_id']}: {action!r}")

# Write all accumulated ss_id updates in one go (only if there were any).
if diag_bib_raw is not None:
    save_to_file(diag_bib_raw, None, path_output_diag_bib)

# diag_bib += items_to_add              # TO-DO FILE STILL HAVE TO BE SAVED
# TO-DO: RE-READ BIB FILE AND UPDATE SS_IDS LIST FROM CODE IN FOR LOOP


# Update blacklist
# update_blacklist_csv(blacklist, blacklist_items)
print("DONE with processing manually checked items")

Working on 0/217
Working on 1/217
Working on 2/217
Working on 4/217
Unable to generate bibtext for 10.1038/s41585-020-0324-x
'family'
Unable to gather information for 10.1038/s41585-020-0324-x
Working on 6/217
Working on 8/217
Working on 9/217
Unable to generate bibtext for 10.23698/AIDA/BRLN
'published'
Unable to gather information for 10.23698/AIDA/BRLN
Working on 10/217
Working on 11/217
Unable to generate bibtext for 10.1093/cid/ciac623
'family'
Unable to gather information for 10.1093/cid/ciac623
Working on 12/217
20
Working on 13/217
Working on 14/217
308
Working on 15/217
Unable to generate bibtext for 10.48550/arXiv.2112.05151
'published'
Unable to gather information for 10.48550/arXiv.2112.05151
Working on 16/217
Unable to generate bibtext for 10.1093/rheumatology/keab835
'family'
Unable to gather information for 10.1093/rheumatology/keab835
Working on 17/217
Unable to generate bibtext for 10.1001/jamaophthalmol.2021.1407
'family'
Unable to gather information for 10.1001/jamao

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Working on 83/217
8
Working on 84/217
Working on 85/217
Working on 86/217
Working on 87/217
Working on 88/217
Working on 90/217
Working on 91/217
Working on 92/217
Working on 93/217
Working on 95/217
Working on 96/217
Working on 97/217
Working on 98/217
Working on 99/217
Working on 100/217
Working on 101/217
Working on 102/217
Working on 103/217
Working on 104/217
Working on 105/217
16
Working on 107/217
Working on 109/217
Working on 111/217
Working on 112/217
Working on 114/217
Working on 116/217
17
Working on 118/217
Working on 119/217
Unable to generate bibtext for 10.1002/mrm.29371
'family'
Unable to gather information for 10.1002/mrm.29371
Working on 120/217
Unable to generate bibtext for 10.3390/jpm11070663
'family'
Unable to gather information for 10.3390/jpm11070663
Working on 121/217
Unable to generate bibtext for 10.48550/arXiv.2305.05984
'published'
Unable to gather information for 10.48550/arXiv.2305.05984
Working on 122/217
Working on 125/217
Working on 126/217
Working on 

In [20]:
# Count occurrences of the literal '{yes}' in the accumulated BibLaTeX text.
# NOTE(review): presumably GetBiblatex writes exactly one '{yes}' field per
# generated entry, making this the number of new entries - confirm.
count = items_to_add.count('{yes}')
print(f"Newly added items: {count}")

Newly added items: 94


In [25]:
# Number of rows collected for the blacklist by the processing loop; they are
# only written to blacklist.csv once update_blacklist_csv is called.
print(f"Blacklisted items: {len(blacklist_items)}")

Blacklisted items: 16


# Update citation counts

In [None]:
# update_citation_count returns the updated entries. Its result used to be
# discarded, so the save below wrote whatever stale `diag_bib_raw` was left in
# the kernel (or failed on a fresh kernel) and no citation count was stored.
# NOTE(review): the entries are parsed from diag_ss.bib, so this save replaces
# diag_ss_new.bib with that file plus citation counts - confirm that ss_id
# additions written earlier to diag_ss_new.bib are not expected to survive.
path_diag_bib = os.path.join('script_data/', 'diag_ss.bib')
diag_bib_raw = update_citation_count(path_diag_bib)
path_output_diag_bib = os.path.join('script_data/', 'diag_ss_new.bib')
save_to_file(diag_bib_raw, None, path_output_diag_bib)

70 items removed from newly found items.
14198a817d2c0a800bfb0a0a36baf5097fe22054 added to diag.bib
2125835bf1c4fd0646b5dd50855d647044c07658 added to diag.bib
24b8cee45431f633d2fa6e3c05670f62b1e41e7e added to diag.bib
2723ce1686eea776df179e362cd9a8b8e2bb7ff1 added to diag.bib
2a7dafe1287670068300ff77401923f7e151b9f4 added to diag.bib
2fe9af8a6b41fc9db41e76621f037aac453ab433 added to diag.bib
349ca29f588b9c785085da7147a4b58df032a8bf added to diag.bib
374f4a7676183c95f901e655f2caf170cdd9ec9d added to diag.bib
37e383517c34818ad049af0aa763ad5906e9f51a added to diag.bib
44454c9090606d0332272ff38df6c87eac15f5f7 added to diag.bib
4f016eb85905d24ce82adeee28bdf66b46870c9e added to diag.bib
740d5d34fcd714870ddf0073fd8956db023319f0 added to diag.bib
81e0cd2d7df521651ce7a093409b02375ef05787 has not been checked yet, make sure only 1 action is mentioned
82eb0252d2239563a34b29e32b75f55f4e35536f has not been checked yet, make sure only 1 action is mentioned
83a8284194d62305f0dae376417dcaa540f4fd53 ha

e844b6b027c94468de8a607497f95a3771b7d48b added to diag.bib
f131ef217543d179269018950bf3b6ba2b30f3b1 added to diag.bib
f643f4a927cf65f1ec66231ae76d3bc1736a67d3 added to diag.bib


In [1]:
# Scratch check: concatenating a str with None is only attempted when the
# second operand is not None, so nothing is printed for b = None.
# NOTE(review): the TypeError recorded below this cell cannot come from this
# guarded version - it looks like stale output; consider deleting the cell.
a = "string"
b = None
if b is not None:
    c = a + b
    print(c)

TypeError: can only concatenate str (not "NoneType") to str