In [1]:
!pip3 install pyarxiv
!pip3 install pybtex
!pip3 install dropbox
!pip3 install tqdm
!pip3 install pycolors2
!pip3 install pdf2image
!pip3 install Unidecode



In [7]:
import os
import requests
import pandas as pd 
import string
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
from difflib import SequenceMatcher
from collections import defaultdict
from datetime import datetime

In [8]:
# Semantic Scholar author ids for every staff member, keyed by display name.
# Each value is a list because one person can be listed under several ids.
staff_id_dict = {
    'Bram van Ginneken': [8038506, 123637526],
    'Francesco Ciompi': [143613202],
    'Alessa Hering': [153744566],
    'Henkjan Huisman': [34754023],
    'Colin Jacobs': [2895994],
    'Peter Koopmans': [34726383],
    'Jeroen van der Laak': [145441238, 145388932],
    'Geert Litjens': [145959882],
    'James Meakin': [4960344],
    'Keelin Murphy': [35730362],
    'Ajay Patel': [2109170880, 2116215861],
    'Cornelia Schaefer-Prokop': [1419819133, 1445069528, 1400632685],
    'Matthieu Rutten': [2074975080, 2156546],
    'Jos Thannhauser': [5752941],
    'Bram Platel': [1798137],
    'Nico Karssemeijer': [1745574],
    'Clarisa Sanchez': [144085811, 32187701],
    'Nikolas Lessman': [2913408],
    'Jonas Teuwen': [32649341, 119024451],
    'Rashindra Manniesing': [2657081],
    'Nadieh Khalili': [144870959],
}

In [9]:
# First and last year (both inclusive) for which a staff member's papers are
# collected. The keys must match those of staff_id_dict. An 'end' of 9999
# places no practical upper bound on the year filter.
staff_year_dict = {
    'Bram van Ginneken': {'start': 1996, 'end': 9999},
    'Francesco Ciompi': {'start': 2013, 'end': 9999},
    'Alessa Hering': {'start': 2018, 'end': 9999},
    'Henkjan Huisman': {'start': 1992, 'end': 9999},
    'Colin Jacobs': {'start': 2010, 'end': 9999},
    'Peter Koopmans': {'start': 2022, 'end': 9999},
    'Jeroen van der Laak': {'start': 1991, 'end': 9999},
    'Geert Litjens': {'start': 2016, 'end': 9999},
    'James Meakin': {'start': 2017, 'end': 9999},
    'Keelin Murphy': {'start': 2018, 'end': 9999},
    'Ajay Patel': {'start': 2015, 'end': 9999},
    'Cornelia Schaefer-Prokop': {'start': 2010, 'end': 9999},
    'Matthieu Rutten': {'start': 2019, 'end': 9999},
    'Jos Thannhauser': {'start': 2022, 'end': 9999},
    'Bram Platel': {'start': 2010, 'end': 2019},
    'Nico Karssemeijer': {'start': 1989, 'end': 2022},
    'Clarisa Sanchez': {'start': 2008, 'end': 2021},
    'Nikolas Lessman': {'start': 2019, 'end': 2022},
    'Jonas Teuwen': {'start': 2017, 'end': 2020},
    'Rashindra Manniesing': {'start': 2010, 'end': 2021},
    # The original entry had a doubled colon here, which is a SyntaxError.
    'Nadieh Khalili': {'start': 2023, 'end': 9999},
}

In [None]:
def remove_blacklist_items(df_new_items, blacklist_path="./script_data/blacklist.csv"):
    """Drop blacklisted rows from a table of candidate items.

    A row is removed when its 'ss_id' appears in the blacklist's 'ss_id'
    column, or when its 'ss_doi' appears in the blacklist's 'doi' column.
    Rows without an 'ss_doi' are never removed by the DOI rule.

    Args:
        df_new_items: DataFrame with at least the columns 'ss_id' and 'ss_doi'.
        blacklist_path: CSV file with the columns 'ss_id' and 'doi'. Defaults
            to the location that was previously hard-coded.

    Returns:
        The filtered DataFrame. The number of removed rows is printed.
    """
    blacklisted_items = pd.read_csv(blacklist_path)
    initial_length = len(df_new_items)

    # Empty cells in the blacklist are read as NaN; drop them so that a
    # partially filled blacklist row can never match on "missing".
    blacklisted_ss_ids = blacklisted_items['ss_id'].dropna().unique().tolist()
    blacklisted_dois = blacklisted_items['doi'].dropna().unique().tolist()

    # Remove blacklisted Semantic Scholar ids.
    df_new_items = df_new_items[~df_new_items['ss_id'].isin(blacklisted_ss_ids)]
    # Remove blacklisted DOIs, but always keep rows that have no DOI at all.
    has_no_doi = df_new_items['ss_doi'].isna()
    df_new_items = df_new_items[~df_new_items['ss_doi'].isin(blacklisted_dois) | has_no_doi]

    print(f"{initial_length-len(df_new_items)} items removed from newly found items.")
    return df_new_items

In [10]:
def from_bib_to_csv(diag_bib_raw):
    """Flatten parsed bib entries into a DataFrame with one row per entry.

    Entries whose type is 'string' are skipped. Every field value has leading
    and trailing curly braces stripped; a field that is absent becomes ''.

    Args:
        diag_bib_raw: iterable of entries exposing `.type`, `.key` and a
            dict-like `.fields`.

    Returns:
        DataFrame with the columns bibkey, type, title, authors, doi,
        gs_citations, journal, year, all_ss_ids and pmid.
    """
    # (output column, bib field it is read from), in output order.
    column_sources = (
        ('title', 'title'),
        ('authors', 'author'),
        ('doi', 'doi'),
        ('gs_citations', 'gscites'),
        ('journal', 'journal'),
        ('year', 'year'),
        ('all_ss_ids', 'all_ss_ids'),
        ('pmid', 'pmid'),
    )
    header = ['bibkey', 'type'] + [column for column, _ in column_sources]

    records = []
    for entry in diag_bib_raw:
        if entry.type == 'string':
            continue
        entry_fields = entry.fields
        record = [entry.key, entry.type]
        for _, source_field in column_sources:
            record.append(entry_fields.get(source_field, '').strip('{}'))
        records.append(record)

    return pd.DataFrame(records, columns=header)

In [11]:
def find_new_ssids(page_size=500, request_timeout=60):
    """Fetch the papers of every staff member from the Semantic Scholar Graph API.

    Every author id in `staff_id_dict` is queried. A paper that has a year is
    kept only when that year lies within the staff member's [start, end]
    window from `staff_year_dict`; a paper without a year is always kept.

    Changes compared with the previous single-request version:
      * results are paged through using the `next` offset in the response, so
        authors with more than `page_size` papers are no longer truncated;
      * a failed HTTP response is reported instead of being read as "no papers";
      * a null `externalIds`, `journal` or `authors` value no longer crashes.

    Args:
        page_size: number of papers requested per call.
        request_timeout: timeout in seconds for each HTTP request.

    Returns:
        DataFrame with the columns staff_id, staff_name, staff_from,
        staff_till, ss_year, ss_id, title, doi, ss_citations, pmid, authors
        and journal.
    """
    paper_fields = 'year,title,authors,externalIds,citationCount,publicationTypes,journal'
    staff_dict = {key: {'ids': staff_id_dict[key], 'years': staff_year_dict[key]} for key in staff_id_dict}
    all_staff_id_ss_data = []

    for idx, (staff_name, values) in enumerate(staff_dict.items()):
        staff_ids = values['ids']
        staff_start = values['years']['start']
        staff_end = values['years']['end']
        print(f'[{idx + 1}/{len(staff_id_dict)}]: {staff_name}')

        for staff_id in staff_ids:
            print('\t\t', staff_id)
            offset = 0

            while True:
                url = (f'https://api.semanticscholar.org/graph/v1/author/{staff_id}/papers'
                       f'?fields={paper_fields}&limit={page_size}&offset={offset}')
                r = requests.get(url, timeout=request_timeout)
                if not r.ok:
                    # Keep going (best effort), but make the gap visible.
                    print(f'\t\t WARNING: HTTP {r.status_code} for author {staff_id} at offset {offset}; '
                          'results for this id may be incomplete')
                    break

                page = r.json()
                for ss_staff_entry in page.get('data', []):
                    # These values can be null in the response; treat null as empty.
                    external_ids = ss_staff_entry.get('externalIds') or {}
                    journal = ss_staff_entry.get('journal') or {}
                    author_entries = ss_staff_entry.get('authors') or []

                    ss_year = ss_staff_entry.get('year')
                    if ss_year is not None:
                        ss_year = int(ss_year)
                        if not staff_start <= ss_year <= staff_end:
                            # probably doesnt belong to DIAG, still captured via another
                            # staff member if also in the same paper
                            continue

                    authors = ' and '.join(author['name'] for author in author_entries if author.get('name'))
                    all_staff_id_ss_data.append([
                        staff_id, staff_name, staff_start, staff_end, ss_year,
                        ss_staff_entry.get('paperId'),
                        ss_staff_entry.get('title'),
                        external_ids.get('DOI'),
                        ss_staff_entry.get('citationCount'),
                        external_ids.get('PubMed'),
                        authors,
                        journal.get('name'),
                    ])

                # 'next' holds the offset of the following page and is absent on the
                # last page; the second check guards against a non-advancing offset.
                next_offset = page.get('next')
                if next_offset is None or next_offset <= offset:
                    break
                offset = next_offset

    ss_columns = ['staff_id', 'staff_name', 'staff_from', 'staff_till', 'ss_year', 'ss_id', 'title', 'doi', 'ss_citations', 'pmid', 'authors', 'journal']
    df_all_staff_id_ss_data = pd.DataFrame(all_staff_id_ss_data, columns=ss_columns)

    print('DONE')
    return df_all_staff_id_ss_data

In [12]:
def return_existing_ssids(bib_file):
    """Collect every Semantic Scholar id stored in the bib file.

    The 'all_ss_ids' field of each entry (entries of type 'string' are
    skipped) is split into individual ids. Commas and whitespace both act as
    separators and all other punctuation (such as the surrounding braces) is
    discarded, so '{id1, id2}', '{id1,id2}' and 'id1 id2' all give
    ['id1', 'id2']. Empty fields contribute nothing.

    Previously punctuation was removed before splitting on single spaces,
    which fused 'id1,id2' into one bogus id and produced '' for empty fields.

    Args:
        bib_file: iterable of entries exposing `.type` and a dict-like `.fields`.

    Returns:
        A flat list of id strings in file order (duplicates are kept).
    """
    # Turn commas into spaces and delete every other punctuation character.
    separators_to_space = str.maketrans(',', ' ', string.punctuation.replace(',', ''))

    all_ss_ids = []
    for entry in bib_file:
        if entry.type == 'string':
            continue
        if 'all_ss_ids' in entry.fields:
            cleaned = entry.fields['all_ss_ids'].translate(separators_to_space)
            # split() without arguments collapses runs of whitespace and
            # never yields empty strings.
            all_ss_ids.extend(cleaned.split())
    return all_ss_ids

In [13]:
# Path of the bib file that is compared against the Semantic Scholar results.
path_diag_bib = os.path.join('script_data/', 'diag_ss.bib')
diag_bib_raw = read_bibfile(None, path_diag_bib) # Author's note: read_bibfile was changed so that a second argument, when given, is used as the full path to the bib file
# Flatten the parsed bib entries into a pandas DataFrame (one row per entry)
df_bib = from_bib_to_csv(diag_bib_raw)

In [14]:
# Query Semantic Scholar for the papers of every staff member; this performs
# one or more HTTP requests per author id and prints its progress.
df_found_items = find_new_ssids()

[1/20]: Bram van Ginneken
		 8038506
		 123637526
[2/20]: Francesco Ciompi
		 143613202
[3/20]: Alessa Hering
		 153744566
[4/20]: Henkjan Huisman
		 34754023
[5/20]: Colin Jacobs
		 2895994
[6/20]: Peter Koopmans
		 34726383
[7/20]: Jeroen van der Laak
		 145441238
		 145388932
[8/20]: Geert Litjens
		 145959882
[9/20]: James Meakin
		 4960344
[10/20]: Keelin Murphy
		 35730362
[11/20]: Ajay Patel
		 2109170880
		 2116215861
[12/20]: Cornelia Schaefer-Prokop
		 1419819133
		 1445069528
		 1400632685
[13/20]: Matthieu Rutten
		 2074975080
		 2156546
[14/20]: Jos Thannhauser
		 5752941
[15/20]: Bram Platel
		 1798137
[16/20]: Nico Karssemeijer
		 1745574
[17/20]: Clarisa Sanchez
		 144085811
		 32187701
[18/20]: Nikolas Lessman
		 2913408
[19/20]: Jonas Teuwen
		 32649341
		 119024451
[20/20]: Rashindra Manniesing
		 2657081
DONE


In [15]:
# Keep a single row per Semantic Scholar paper id (the first occurrence wins)
# and then keep only the papers whose year is 2015 or later.
df_found_items = (
    df_found_items
    .drop_duplicates(subset=['ss_id'])
    .loc[lambda frame: frame['ss_year'] >= 2015]
)

In [16]:
# Parallel lists taken from the same frame: found_dois[k] is the DOI (or a
# missing value) of the paper whose Semantic Scholar id is found_items[k].
found_items = df_found_items['ss_id'].tolist()
found_dois = df_found_items['doi'].tolist()

In [17]:
# All Semantic Scholar ids that are already recorded in the bib file.
existing_items = return_existing_ssids(diag_bib_raw)

In [21]:
# Column layout of the manual-check sheet; every tuple appended to the
# result lists in this notebook follows this order.
columns = ['bibkey', 'ss_id', 'url', 'match score', 'bib_doi', 'ss_doi', 'bib_title', 'ss_title', 'staff_id', 'staff_name', 'bib_authors', 'ss_authors', 'bib_journal', 'ss_journal', 'bib_year', 'ss_year', 'bib_type', 'ss_pmid', 'reason', 'action']
actions_list = '[add ss_id, blacklist ss_id, add new item, add manually, None]'
list_doi_match = []   # rows for bib entries whose DOI matches a found paper that is not linked yet
not_new = []          # found ss_ids that are already linked to some bib entry
ss_id_match = []      # found ss_ids that were matched to a bib entry through the DOI

# Built once so the loop does constant-time lookups instead of scanning lists.
punctuation_table = str.maketrans('', '', string.punctuation)
found_ids_lookup = set(found_items)
first_ss_id_by_doi = {}
for found_doi, found_id in zip(found_dois, found_items):
    # setdefault keeps the first paper per DOI, like list.index() did before.
    first_ss_id_by_doi.setdefault(found_doi, found_id)

for index, row in df_bib.iterrows():
    # Columns are read by label; positional access such as row[4] on a
    # string-labelled Series is deprecated in recent pandas versions.
    doi = row['doi']
    ss_ids = row['all_ss_ids']
    all_ss_ids = []
    if ss_ids is not None:
        all_ss_ids = [el.translate(punctuation_table).strip() for el in ss_ids.split(',')]

    # Check if any existing bib-item has the same ss_id as an item on found_items -> is this correct? what if that ss_id has to be linked to another item as well?
    for ss_id in all_ss_ids:
        if ss_id in found_ids_lookup:
            not_new.append(ss_id)

    # Check if any existing bib-item has the same doi as an item on found_items
    if doi is not None and doi in first_ss_id_by_doi:
        ss_id = first_ss_id_by_doi[doi]
        # Check if that bib-item is already linked with the ss_id
        if ss_id not in all_ss_ids:
            # Select the found paper once instead of re-filtering the frame per column.
            found_row = df_found_items[df_found_items['ss_id'] == ss_id]
            pmid = found_row['pmid'].item()
            ss_title = found_row['title'].item()
            ss_authors = found_row['authors'].item()
            ss_journal = found_row['journal'].item()
            ss_year = int(found_row['ss_year'].item())
            staff_id = int(found_row['staff_id'].item())
            staff_name = found_row['staff_name'].item()
            ratio = SequenceMatcher(a=ss_title, b=row['title']).ratio()
            ss_id_match.append(ss_id)
            list_doi_match.append((row['bibkey'], ss_id, 'https://www.semanticscholar.org/paper/'+ss_id, ratio, doi, doi, row['title'], ss_title, staff_id, staff_name, row['authors'], ss_authors, row['journal'], ss_journal, row['year'], ss_year, row['type'], pmid, 'doi match', actions_list))

In [51]:
# Sanity check for ss_ids that are already linked in the bib file: print every
# pair whose bib title and Semantic Scholar title are less than 90% similar.

# Parse the bib table once (ss_id -> titles of the bib entries that list it,
# in bib order) instead of re-parsing every row for every id in not_new.
punctuation_table = str.maketrans('', '', string.punctuation)
bib_titles_by_ss_id = defaultdict(list)
for raw_ss_ids, bib_title in zip(df_bib['all_ss_ids'], df_bib['title']):
    if raw_ss_ids is None:
        continue
    # A set, so an id repeated inside one entry counts once for that entry.
    for linked_id in {el.translate(punctuation_table).strip() for el in raw_ss_ids.split(',')}:
        bib_titles_by_ss_id[linked_id].append(bib_title)

for m in not_new:
    title = df_found_items[df_found_items['ss_id'] == m]['title'].item()
    for title_bib in bib_titles_by_ss_id.get(m, []):
        ratio = SequenceMatcher(a=title, b=title_bib).ratio()
        if ratio < 0.9:
            print(title_bib)
            print(title)
            print(ratio)
            print('***')

In [23]:
# Candidate ids: everything that was found, minus the ids that are already
# linked in the bib file and minus the ids that were matched through a DOI.
to_add = set(found_items).difference(not_new, ss_id_match)
is_candidate = df_found_items['ss_id'].isin(to_add)
new_items = df_found_items[is_candidate]

In [24]:
# Remove blacklist items
blacklist = pd.read_csv('script_data/blacklist.csv')
# Empty DOI cells in the blacklist are read as NaN. Drop them and explicitly
# keep candidates that have no DOI, so a blacklist row without a DOI can never
# remove DOI-less candidates (the same rule remove_blacklist_items applies).
blacklisted_dois = blacklist['doi'].dropna().unique().tolist()
new_items = new_items[~new_items['doi'].isin(blacklisted_dois) | new_items['doi'].isna()]

In [25]:
# DOIs and Semantic Scholar ids of the remaining candidates, as parallel lists.
dois = new_items['doi'].tolist()
ss_ids = new_items['ss_id'].tolist()

In [39]:
# title match
# Every candidate is compared with all bib titles. A candidate with at least
# one bib title above 0.8 similarity yields one 'title match' row per such bib
# entry; otherwise it yields a single row next to its closest bib entry,
# labelled 'doi None' or 'new item'.
titles = new_items['title'].tolist()
dois = new_items['doi'].tolist()
ss_ids = new_items['ss_id'].tolist()
list_title_match = []
list_no_dois = []
list_to_add = []

# Loop-invariant: normalise the bib titles once rather than once per candidate.
bib_titles_plain = df_bib['title'].apply(lambda t: t.replace('{', '').replace('}', ''))
bib_titles_lower = df_bib['title'].apply(lambda t: t.lower().replace('{', '').replace('}', ''))

# Walk the candidate columns side by side instead of re-filtering new_items
# by ss_id for every single value.
item_columns = ['ss_id', 'title', 'doi', 'staff_id', 'staff_name', 'authors', 'journal', 'ss_year', 'pmid']
for ss_id, ss_title, doi, staff_id, staff_name, ss_authors, ss_journal, ss_year, ss_pmid in zip(
        *(new_items[column].tolist() for column in item_columns)):
    ss_title_lower = ss_title.lower()
    title_match_ratios = bib_titles_lower.apply(
        lambda bib_title: SequenceMatcher(a=ss_title_lower, b=bib_title).ratio())
    paper_url = f'https://www.semanticscholar.org/paper/{ss_id}'
    max_ratio = title_match_ratios.max()
    best_bib_rows = df_bib[title_match_ratios == max_ratio]
    max_bibkey = best_bib_rows['bibkey'].iloc[0]
    max_bib_title = best_bib_rows['title'].iloc[0].replace('{', '').replace('}', '')

    above_threshold = title_match_ratios > 0.8
    if above_threshold.any():
        for i, match in df_bib[above_threshold].iterrows():
            list_title_match.append((
                match['bibkey'],
                ss_id,
                paper_url,
                title_match_ratios.loc[i],
                match['doi'],
                doi,
                bib_titles_plain.loc[i],
                ss_title,
                staff_id,
                staff_name,
                match['authors'],
                ss_authors,
                match['journal'],
                ss_journal,
                match['year'],
                ss_year,
                match['type'],
                ss_pmid,
                'title match', actions_list))
    else:
        authors = best_bib_rows['authors'].iloc[0]
        bib_doi = best_bib_rows['doi'].iloc[0]
        max_bib_journal = best_bib_rows['journal'].iloc[0]
        max_bib_year = best_bib_rows['year'].iloc[0]
        type_article = best_bib_rows['type'].iloc[0]

        # pd.isna also catches NaN, which a plain `is None` test would send
        # to the 'new item' list.
        doi_missing = pd.isna(doi)
        target_list = list_no_dois if doi_missing else list_to_add
        reason = 'doi None' if doi_missing else 'new item'
        target_list.append((max_bibkey, ss_id, paper_url, max_ratio, bib_doi, doi, max_bib_title, ss_title, staff_id, staff_name, authors, ss_authors, max_bib_journal, ss_journal, max_bib_year, ss_year, type_article, ss_pmid, reason, actions_list))

In [40]:
# All rows for the manual-check sheet: new items first, then items without a
# DOI, then title matches, then DOI matches.
total_list = list_to_add + list_no_dois + list_title_match + list_doi_match

In [41]:
# Assemble the manual-check table and derive today's output file name.
df = pd.DataFrame(data=total_list, columns=columns)
today = datetime.now()
current_date = today.strftime("%Y%m%d")
file_name = f'script_data/manual_check_{current_date}.xlsx'


# Remove blacklist items from manual check

In [1]:
# Drop the rows whose ss_id or ss_doi is on the blacklist; the helper prints
# how many rows it removed.
df = remove_blacklist_items(df)

3 items removed from newly found items.


In [58]:
# Order the sheet by Semantic Scholar id so that rows about the same paper
# end up next to each other.
df = df.sort_values(by=['ss_id'])

In [59]:
# Write the manual-check sheet to the dated Excel file, without the index column.
df.to_excel(file_name, index=False)