In [1]:
# !pip install pyarxiv
# !pip install pybtex
# !pip install dropbox
# !pip install tqdm
# !pip install pycolors2
# !pip install pdf2image

# Libraries

In [2]:
from pathlib import Path
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
import pandas as pd
import os
import numpy as np
from ast import literal_eval

# Import data

## DIAG bib

In [43]:
# Location of the DIAG bibliography, one directory above the working directory.
path_diag_bib = os.path.join('..', 'diag.bib')
# NOTE(review): this output path is reassigned to script_data/diag_ss.bib in the
# save section before it is ever used, so the value set here has no effect.
path_output_diag_bib = os.path.join('..', 'diag_taverne.bib')

In [44]:
# read_bibfile was modified locally: when a second argument is supplied, it is
# used as the full path to the bib file, which is why the first argument is
# passed as None here.
diag_bib_raw = read_bibfile(None, path_diag_bib)

## SS match data

### multiple_bibkeys_maybe_multiple_ss_ids
- bibkey : many
- ss_id: none, one, many 

In [45]:
# Lookup table for the "multiple bibkeys" case; its 'up80_bibkeys' column is
# parsed with literal_eval in the next cell.
df_matches_many = pd.read_csv('script_data/TEMP_LOOKUP_multiple_bibkeys_maybe_multiple_ss_ids.csv')
# df_matches_many.head(3)

In [46]:
# Flatten the per-row bibkey collections (stored as strings in the CSV) into a
# single list. Duplicates are kept on purpose: nothing is de-duplicated here.
many_bibkeys = [
    bibkey
    for str_list in df_matches_many['up80_bibkeys']
    for bibkey in literal_eval(str_list)
]
many_bibkeys[:5]

['Adri09', 'Adri11c', 'Anto21', 'Anto22', 'Anto21']

### matches_single_bibkey_with_none_one_or_many_ss_ids
- bibkey: one
- ss_id: none, one, many

In [53]:
# Lookup table for the "single bibkey" case. It is expected to hold at most one
# row per bibkey (this is asserted when the bib entries are annotated).
df_matches_one = pd.read_csv('script_data/TEMP_LOOKUP_matches_single_bibkey_with_none_one_or_many_ss_ids.csv')
# Disabled filters, kept for reference:
#   1. drop rows whose bibkey also occurs in many_bibkeys
#   2. drop the leftover "Unnamed: 0" column
# df_matches_one = df_matches_one[~df_matches_one['bibkey'].isin(many_bibkeys)]
# df_matches_one = df_matches_one[[col for col in df_matches_one.columns if col != "Unnamed: 0"]]

In [56]:
# Number of rows (bibkeys) in the single-bibkey lookup table.
df_matches_one.shape[0]

121

In [54]:
# Preview the first five rows of the single-bibkey lookup table.
df_matches_one.head(n=5)

Unnamed: 0.1,Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
0,3,Abel19,article,"Computational pathology definitions, best prac...","Abels, Esther and Pantanowitz, Liron and Aeffn...",,,,2019.0
1,19,Amga20,article,Report on computational assessment of Tumor In...,Mohamed Amgad and and Elisabeth Specht Stovgaa...,b4c4c3dc91d42114023b0575c3e2273b87446ff7,"['69999230b02054b82254684a73bb8a4c83878d28', '...",[ss_ids: same DOI] title matching: single bibk...,2020.0
2,21,Anto21,article,The Medical Segmentation Decathlon,Michela Antonelli and Annika Reinke and Spyrid...,,,,2021.0
3,22,Anto22,article,The {Medical} {Segmentation} {Decathlon,"Antonelli, Michela and Reinke, Annika and Baka...",979a9f247700d00ff2c3f0612d5eb001379f93c8,"['979a9f247700d00ff2c3f0612d5eb001379f93c8', '...",[ss_ids: same DOI] multiple doi matches ['979a...,2022.0
4,26,Ares18,article,iW-Net}: an automatic and minimalistic interac...,Guilherme Aresta and Colin Jacobs and Teresa A...,,,,2018.0


In [55]:
# How many rows carry a non-missing 'ss_id'. int() keeps the displayed value a
# plain Python integer rather than a numpy scalar.
int(df_matches_one['ss_id'].notna().sum())

39

# Add ss_ids to DIAG bib

#### Logic:
Loop over entries:
- Skip Journal strings
- Look up entry key (bibkey) in df_matches_one
    - If bibkey not in df, skip
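- Check if there are SS matches for this bibkey (the row's 'all_ss_ids' is not empty)
    - If so, add this row's 'ss_id' and 'all_ss_ids' values to the entry
    - If the entry already has one of these fields, do not overwrite it; only check that it equals the value in the row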

In [49]:
# Annotate each bib entry, in place, with the Semantic Scholar data found for
# its bibkey in df_matches_one.
#
# For every field in ss_fields:
#   - field missing on the entry  -> it is filled with '{<value>}'
#   - field present and equal     -> nothing to do (re-running is harmless)
#   - field present but different -> the conflict is printed, the bibkey is
#     recorded in mismatched_bibkeys, and the existing value is kept
#
# A conflict no longer aborts the whole loop: previously the first mismatch
# stopped processing and left all later entries unannotated.
ss_fields = ('ss_id', 'all_ss_ids')
mismatched_bibkeys = []

for entry in diag_bib_raw:
    # Skip journal string definitions
    if entry.type == 'string':
        continue

    # Look up this entry's row in the single-bibkey table
    bibkey = entry.key
    match_rows = df_matches_one[df_matches_one['bibkey'] == bibkey]

    # Double checks
    if len(match_rows) == 0:
        continue
    assert len(match_rows) == 1, 'multiple rows should not be possible in this DF'
    match_row = match_rows.iloc[0]

    # No SS match recorded for this bibkey
    if pd.isna(match_row['all_ss_ids']):
        continue

    for field in ss_fields:
        value = match_row[field]
        # A row can have 'all_ss_ids' without 'ss_id'; concatenating NaN with a
        # string would raise TypeError, so a missing value is simply skipped.
        if pd.isna(value):
            continue
        expected = '{' + value + '}'

        if field not in entry.fields:
            entry.fields[field] = expected
        elif entry.fields[field] != expected:
            print(f'{field} data not the same\n', bibkey, entry.fields[field], expected)
            mismatched_bibkeys.append(bibkey)
            # Leave the remaining fields of this conflicting entry untouched
            # and move on to the next entry.
            break

# Surface conflicts once more at the end so they are not lost in the output.
if mismatched_bibkeys:
    print('bibkeys with conflicting SS data (existing values kept):', mismatched_bibkeys)
    
    
#     if entry.type == 'phdthesis':
#         entry.fields['journal'] = '{PhD thesis}'
#     if entry.type == 'mastersthesis':
#         entry.fields['journal'] = '{Master thesis}'
#     else:
#         pass

In [51]:
# Count the bib entries that now carry an 'ss_id' field.
count = sum(1 for entry in diag_bib_raw if 'ss_id' in entry.fields)
print(count)

36


# Save new bib file (diag_ss.bib)

In [58]:
# Output path for the annotated bib file, inside script_data/. This replaces
# the value assigned to path_output_diag_bib earlier in the notebook.
path_output_diag_bib = os.path.join('script_data', 'diag_ss.bib')

In [59]:
# Write the annotated entries to path_output_diag_bib.
# NOTE(review): the None second argument presumably mirrors the read_bibfile
# call, with the last argument used as the full path. Confirm against
# bib_handling_code.processbib.save_to_file.
save_to_file(diag_bib_raw, None, path_output_diag_bib)