In [1]:
# !pip install pyarxiv
# !pip install pybtex
# !pip install dropbox
# !pip install tqdm
# !pip install pycolors2
# !pip install pdf2image

# Libraries

In [2]:
from pathlib import Path
from bib_handling_code.processbib import read_bibfile
from bib_handling_code.processbib import save_to_file
import pandas as pd
import os
import numpy as np
from ast import literal_eval

# Imports

## DIAG bib

In [3]:
path_diag_bib = os.path.join('..', 'diag.bib')
path_output_diag_bib = os.path.join('..', 'diag_taverne.bib')

In [4]:
diag_bib_raw = read_bibfile(None, path_diag_bib) # I changed the code in such a way that IF I give a second argument, it uses the second argument as a full path

In [5]:
bibkeys_all = []
for entry in diag_bib_raw:
    if entry.type == 'string':
        continue
    if 'year' in entry.fields:
        if (int(entry.fields['year'].strip('{}')) < 2015):
            continue
    bibkeys_all.append(entry.key)

In [6]:
print(len(bibkeys_all))
bibkeys_all[:5]

865


['Genu22', 'Abel19', 'Alee18', 'Altu20', 'Alves21a']

## SS match data

### multiple_bibkeys_maybe_multiple_ss_ids
- bibkey : many
- ss_id: none, one, many 

In [7]:
df_matches_many = pd.read_csv('script_data/TEMP_LOOKUP_multiple_bibkeys_maybe_multiple_ss_ids.csv')
# df_matches_multiple.head(3)

In [8]:
df_matches_many.head()

Unnamed: 0,ss_id,ss_title,max_bibkey,max_bib_title,max_ratio,up80_bibkeys,up80_bib_titles,up80_ratios
0,9fabf1bc1c126702a051ec56474af2d1518ad0ba,Focal fatty areas in the myocardium of patient...,Adri11c,Focal fatty areas in the myocardium of patient...,0.994872,"['Adri09', 'Adri11c']",['Fatty foci in the myocardium in patients wit...,"[0.8020833333333334, 0.9948717948717949]"
1,979a9f247700d00ff2c3f0612d5eb001379f93c8,The Medical Segmentation Decathlon,Anto21,The Medical Segmentation Decathlon,1.0,"['Anto21', 'Anto22']","['The Medical Segmentation Decathlon', 'The Me...","[1.0, 1.0]"
2,c397c6f1480ac8e3ed875adad96e9b3e00c37f26,The Medical Segmentation Decathlon,Anto21,The Medical Segmentation Decathlon,1.0,"['Anto21', 'Anto22']","['The Medical Segmentation Decathlon', 'The Me...","[1.0, 1.0]"
3,b3dc561dd990cebc626e10318b8582a198aa3571,iW-Net: an automatic and minimalistic interact...,Ares18,iW-Net: an automatic and minimalistic interact...,1.0,"['Ares18', 'Ares19']",['iW-Net: an automatic and minimalistic intera...,"[1.0, 1.0]"
4,0655dcaa39cf41a3609974840f91300d73b4aed1,The Liver Tumor Segmentation Benchmark (LiTS),Bili19,The Liver Tumor Segmentation Benchmark (LiTS),1.0,"['Bili19', 'Bili22']",['The Liver Tumor Segmentation Benchmark (LiTS...,"[1.0, 1.0]"


In [9]:
bibkeys_many = []
for str_list in list(df_matches_many['up80_bibkeys']):
    bibkeys_many.extend(literal_eval(str_list))
print(len(bibkeys_many))
bibkeys_many = list(np.unique(bibkeys_many))
bibkeys_many[:5]

228


['Adri09', 'Adri11c', 'Anto21', 'Anto22', 'Ares18']

### matches_single_bibkey_with_none_or_many_ss_ids
Filename is misleading right now, during further investigation we removed the 1-to-1 matches because these were fine
- bibkey: one
- ss_id: none, many

In [10]:
df_matches_one_to_none_or_many = pd.read_csv('script_data/TEMP_LOOKUP_matches_single_bibkey_with_none_one_or_many_ss_ids.csv')
# df_matches_one = df_matches_one[~df_matches_one['bibkey'].isin(many_bibkeys)]
# df_matches_one = df_matches_one[[col for col in df_matches_one.columns if col != "Unnamed: 0"]]

In [11]:
print(len(df_matches_one_to_none_or_many))
df_matches_one_to_none_or_many.head()

112


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
0,Abel19,article,"Computational pathology definitions, best prac...","Abels, Esther and Pantanowitz, Liron and Aeffn...",,,,2019.0
1,Amga20,article,Report on computational assessment of Tumor In...,Mohamed Amgad and and Elisabeth Specht Stovgaa...,b4c4c3dc91d42114023b0575c3e2273b87446ff7,"['69999230b02054b82254684a73bb8a4c83878d28', '...",[ss_ids: same DOI] title matching: single bibk...,2020.0
2,Anto21,article,The Medical Segmentation Decathlon,Michela Antonelli and Annika Reinke and Spyrid...,,,,2021.0
3,Anto22,article,The {Medical} {Segmentation} {Decathlon,"Antonelli, Michela and Reinke, Annika and Baka...",979a9f247700d00ff2c3f0612d5eb001379f93c8,"['979a9f247700d00ff2c3f0612d5eb001379f93c8', '...",[ss_ids: same DOI] multiple doi matches ['979a...,2022.0
4,Ares18,article,iW-Net}: an automatic and minimalistic interac...,Guilherme Aresta and Colin Jacobs and Teresa A...,,,,2018.0


In [12]:
bibkeys_one_to_none_or_many = df_matches_one_to_none_or_many['bibkey']
bibkeys_one_to_none_or_many[:5]

0    Abel19
1    Amga20
2    Anto21
3    Anto22
4    Ares18
Name: bibkey, dtype: object

### df one-to-none

In [13]:
df_matches_one_to_none = df_matches_one_to_none_or_many[pd.isna(df_matches_one_to_none_or_many['ss_id'])]

In [14]:
print(len(df_matches_one_to_none))
df_matches_one_to_none.head()

80


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
0,Abel19,article,"Computational pathology definitions, best prac...","Abels, Esther and Pantanowitz, Liron and Aeffn...",,,,2019.0
2,Anto21,article,The Medical Segmentation Decathlon,Michela Antonelli and Annika Reinke and Spyrid...,,,,2021.0
4,Ares18,article,iW-Net}: an automatic and minimalistic interac...,Guilherme Aresta and Colin Jacobs and Teresa A...,,,,2018.0
9,Bili19,article,The {L}iver {T}umor {S}egmentation {B}enchmark...,Patrick Bilic and Patrick Ferdinand Christ and...,,,,2019.0
12,Busc16,article,Risk factors for COPD exacerbations in inhaled...,"Busch, Robert and Han, MeiLan K and Bowler, Ru...",,,,2016.0


In [15]:
bibkeys_one_to_none = list(df_matches_one_to_none['bibkey'])
bibkeys_one_to_none[:5]

['Abel19', 'Anto21', 'Ares18', 'Bili19', 'Busc16']

### df one-to-many

In [16]:
df_matches_one_to_many = df_matches_one_to_none_or_many[~pd.isna(df_matches_one_to_none_or_many['ss_id'])]

In [17]:
print(len(df_matches_one_to_many))
df_matches_one_to_many.head()

32


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
1,Amga20,article,Report on computational assessment of Tumor In...,Mohamed Amgad and and Elisabeth Specht Stovgaa...,b4c4c3dc91d42114023b0575c3e2273b87446ff7,"['69999230b02054b82254684a73bb8a4c83878d28', '...",[ss_ids: same DOI] title matching: single bibk...,2020.0
3,Anto22,article,The {Medical} {Segmentation} {Decathlon,"Antonelli, Michela and Reinke, Annika and Baka...",979a9f247700d00ff2c3f0612d5eb001379f93c8,"['979a9f247700d00ff2c3f0612d5eb001379f93c8', '...",[ss_ids: same DOI] multiple doi matches ['979a...,2022.0
5,Argu19,article,Function Follows Form: Regression from Complet...,"Argus, Max and Schaefer-Prokop, Cornelia and L...",6425e3f4c37f8deb9e8dc933e34d49aa843635b9,"['6425e3f4c37f8deb9e8dc933e34d49aa843635b9', '...",[ss_ids: no DOIs!] title matching: single bibk...,2019.0
6,Balk19,article,Deep learning and manual assessment show that ...,"Balkenhol, Maschenka C A and Bult, Peter and T...",94962ef85aca6df5c8289874b56bcaa8722dc596,"['91bcebba717670d740ef76097a0885a0bd3c9dde', '...",[ss_ids: same DOI] title matching: single bibk...,2019.0
7,Balt19b,article,Optimization of the difference-of-Gaussian cha...,"Balta, Christiana and Bouwman, Ramona W and Br...",f131ef217543d179269018950bf3b6ba2b30f3b1,"['20fe29e553477d9bd330f052e4d01c0c1c616b22', '...",[ss_ids: multiple DOIs!] title matching: singl...,2019.0


In [18]:
bibkeys_one_to_many = list(df_matches_one_to_many['bibkey'])
bibkeys_one_to_many[:5]

['Amga20', 'Anto22', 'Argu19', 'Balk19', 'Balt19b']

### df one-to-one

In [19]:
df_matches_one_to_one = pd.read_csv('script_data/df_matches.csv')
df_matches_one_to_one = df_matches_one_to_one[~df_matches_one_to_one['type'].isin(['conference', 'book', 'inbook', 'mastersthesis', 'phdthesis', 'patent', 'misc'])]
df_matches_one_to_one = df_matches_one_to_one[df_matches_one_to_one['comment'].isin(['title matching: single bibkey, single ss_id', 'single doi match'])]
df_matches_one_to_one['all_ss_ids'] = df_matches_one_to_one['ss_id'].apply(lambda x: [x])
df_matches_one_to_one.to_csv('script_data/df_matches_one_to_one.csv', index=False)
df_matches_one_to_one = pd.read_csv('script_data/df_matches_one_to_one.csv')

In [20]:
df_matches_one_to_one.head()

Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment
0,Abra08a,article,Evaluation of a system for automatic detection...,M. D. Abr\`amoff and M. Niemeijer and M. S. A....,0445a1a0aa30c9ca41a8ab31463d617b849a0814,['0445a1a0aa30c9ca41a8ab31463d617b849a0814'],"title matching: single bibkey, single ss_id"
1,Adri10,article,Echocardiographic screening results in patient...,Miraude E A P M Adriaensen and Maarten J M Cra...,3da4ab4d50211eb7a0148c4ed74729b2d4e30ba9,['3da4ab4d50211eb7a0148c4ed74729b2d4e30ba9'],"title matching: single bibkey, single ss_id"
2,Adri11,article,Radiological evidence of lymphangioleiomyomato...,M. E A P M Adriaensen and C. M. Schaefer-Proko...,691c90e68eab1dcae5861fba0eab73ba626e6777,['691c90e68eab1dcae5861fba0eab73ba626e6777'],"title matching: single bibkey, single ss_id"
3,Adri11b,article,Mature fat cells in the myocardium of patients...,Miraude E A P M Adriaensen and Matthijs F M va...,c738da78dc1fb336a8dff5a8047059d52a6de939,['c738da78dc1fb336a8dff5a8047059d52a6de939'],"title matching: single bibkey, single ss_id"
4,Adri11c,article,Focal fatty areas in the myocardium of patient...,Miraude E A P M Adriaensen and Harm H H Fering...,9fabf1bc1c126702a051ec56474af2d1518ad0ba,['9fabf1bc1c126702a051ec56474af2d1518ad0ba'],single doi match


In [21]:
bibkeys_one_to_one = list(df_matches_one_to_one['bibkey'])
bibkeys_one_to_one[:5]

['Abra08a', 'Adri10', 'Adri11', 'Adri11b', 'Adri11c']

# MANUAL_LOOKUP files

In [22]:
df_ML_one_to_many = pd.read_excel(r'script_data/complete_MANUAL_LOOKUP_matches_single_bibkey_with_many_ss_ids_KM.xlsx')
df_ML_many = pd.read_excel(r'script_data/complete_MANUAL_LOOKUP_multiple_bibkeys_maybe_multiple_ss_ids.xlsx')
display(df_ML_one_to_many.head(2))
display(df_ML_many.head(2))

Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year,doi_comment,ADD_SS_TO_THIS_BIB?,ADD_SS_TO_BLACKLIST?,COMMENT
0,Kooi17d,article,Classifying Symmetrical Differences and Tempor...,Thijs Kooi and Nico Karssemeijer,a908e3775fd064b0d249728a7261cfb1efa13b87,"['a908e3775fd064b0d249728a7261cfb1efa13b87', '...",title matching: single bibkey with multiple_ss...,2017,"[ss_ids: 1 no DOI, 1 have same DOI]","['True', 'False']","['False', 'False']","The first one is SPIE, the second one is arxiv..."
1,Band23,article,Continual learning strategies for cancer-indep...,"B\'{a}ndi, P. and Balkenhol, Maschenka and van...",4bcd672218ecec70473c84f6f1cc52c64031f3e5,"['4bcd672218ecec70473c84f6f1cc52c64031f3e5', '...",title matching: single bibkey with multiple_ss...,2023,[ss_ids: contain multiple DOIs!],"['True', 'False']","['False', 'False']",The second one is a different publication (arx...


Unnamed: 0,ss_id,ss_title,max_bibkey,max_bib_title,max_ratio,up80_bibkeys,up80_bib_titles,up80_ratios,ADD_SS_TO_BIB?,BLACKLIST_SS?,COMMENT,ASSIGNED
0,9fabf1bc1c126702a051ec56474af2d1518ad0ba,Focal fatty areas in the myocardium of patient...,Adri11c,Focal fatty areas in the myocardium of patient...,0.994872,"['Adri09', 'Adri11c']",['Fatty foci in the myocardium in patients wit...,"[0.8020833333333334, 0.9948717948717949]","['False', 'True']",False,2fddc4cdc2422aa7c74a013bb8b41e976772f9bc' is t...,Marina
1,979a9f247700d00ff2c3f0612d5eb001379f93c8,The Medical Segmentation Decathlon,Anto21,The Medical Segmentation Decathlon,1.0,"['Anto21', 'Anto22']","['The Medical Segmentation Decathlon', 'The Me...","[1.0, 1.0]","['True', 'True']",False,"Anto21 is arxiv, Anto22 is journal",Marina


In [23]:
len(df_ML_one_to_many)

33

In [24]:
len(df_matches_one_to_many)

32

In [25]:
len(df_ML_many)

102

In [26]:
len(df_matches_many)

102

# BIBKEYS CHECK 
### Feel free to minimize this and continue from "Add ss_ids to DAIG bib"
#### Summary of what we checked here:
Checked overlap of the subsets. The confusion was that there can be multiple bibkeys that match with a single or more SS_ids. These we saved in a separate csv.

The bibkeys from this csv WERE NOT matched specifically becasue matching (on title) could be ambiguous and should be done manually, however, it could be that some bibkeys WERE matched on DOI. Therefore there is overlap of bibkeys in all subsets

In [27]:
print(len(bibkeys_one_to_none))
len(set(bibkeys_one_to_none))

80


80

In [28]:
print(len(bibkeys_one_to_many))
len(set(bibkeys_one_to_many))

32


32

In [29]:
print(len(bibkeys_one_to_one))
len(set(bibkeys_one_to_one))

940


940

In [30]:
print(len(bibkeys_many))
len(set(bibkeys_many))

151


151

In [31]:
print(len(bibkeys_one_to_none + bibkeys_one_to_many + bibkeys_one_to_one + bibkeys_many))
len(set(bibkeys_one_to_none + bibkeys_one_to_many + bibkeys_one_to_one + bibkeys_many))

1203


1102

In [32]:
print(df_matches_one_to_many[df_matches_one_to_many['bibkey']=='Band23']['all_ss_ids'].item())

['4bcd672218ecec70473c84f6f1cc52c64031f3e5', 'bc6b483f9b1fa630fa4b43158b13716f1ca7497b']


In [33]:
bibkeys_marina = ['Balo13']

In [34]:
bibkey_lists = [bibkeys_one_to_none, bibkeys_one_to_many, bibkeys_one_to_one, bibkeys_many, bibkeys_marina]
bibkey_lists_names = ['bibkeys_one_to_none', 'bibkeys_one_to_many', 'bibkeys_one_to_one', 'bibkeys_many', 'bibkeys_marina']

In [35]:
for i, lst in enumerate(bibkey_lists):
    print(f"List {bibkey_lists_names[i]} contains:")
    for j, other_lst in enumerate(bibkey_lists):
        if i != j:
            common_items = len(set(lst) & set(other_lst))
            print(f"- {common_items} items from List {bibkey_lists_names[j]}")
    print('\n')

List bibkeys_one_to_none contains:
- 0 items from List bibkeys_one_to_many
- 0 items from List bibkeys_one_to_one
- 14 items from List bibkeys_many
- 0 items from List bibkeys_marina


List bibkeys_one_to_many contains:
- 0 items from List bibkeys_one_to_none
- 0 items from List bibkeys_one_to_one
- 2 items from List bibkeys_many
- 0 items from List bibkeys_marina


List bibkeys_one_to_one contains:
- 0 items from List bibkeys_one_to_none
- 0 items from List bibkeys_one_to_many
- 85 items from List bibkeys_many
- 0 items from List bibkeys_marina


List bibkeys_many contains:
- 14 items from List bibkeys_one_to_none
- 2 items from List bibkeys_one_to_many
- 85 items from List bibkeys_one_to_one
- 0 items from List bibkeys_marina


List bibkeys_marina contains:
- 0 items from List bibkeys_one_to_none
- 0 items from List bibkeys_one_to_many
- 0 items from List bibkeys_one_to_one
- 0 items from List bibkeys_many




### bibkeys_many_in_one_to_one

These are cases where only 1 bibkey was an article

In [36]:
bibkeys_many_in_one_to_one = list(set(bibkeys_many) & set(bibkeys_one_to_one))
print(len(bibkeys_many_in_one_to_one))
bibkeys_many_in_one_to_one[:5]

85


['Heuv18a', 'Ginn09a', 'Brui04', 'Tan12', 'Kall10']

In [37]:
double_idx=3

In [38]:
double = bibkeys_many_in_one_to_one[double_idx]

print('df_matches_one_to_one')
display(df_matches_one_to_one[df_matches_one_to_one['bibkey']==double])

print('df_matches_many')
df_matches_many[df_matches_many['up80_bibkeys'].apply(lambda lst: double in lst)]

df_matches_one_to_one


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment
802,Tan12,inproceedings,Detection of Breast Cancer in Automated {3D} B...,Tao Tan and Bram Platel and Roel Mus and Nico ...,443e44a373875423637e9b519b451031635f2d35,['443e44a373875423637e9b519b451031635f2d35'],single doi match


df_matches_many


Unnamed: 0,ss_id,ss_title,max_bibkey,max_bib_title,max_ratio,up80_bibkeys,up80_bib_titles,up80_ratios
92,2704781e2ff16f7f31bd97e4d393e685b9ab0875,Classification of breast lesions in automated ...,Tan11,Classification of Breast Lesions in Automated ...,1.0,"['Tan11', 'Tan12']",['Classification of Breast Lesions in Automate...,"[1.0, 0.8095238095238095]"
93,443e44a373875423637e9b519b451031635f2d35,Detection of breast cancer in automated 3D bre...,Tan12,Detection of Breast Cancer in Automated 3D Bre...,1.0,"['Tan11', 'Tan12', 'Tan13a']",['Classification of Breast Lesions in Automate...,"[0.8095238095238095, 1.0, 0.828125]"
94,8e7a787fc98d409a3f6c63826e53c60210894c19,Computer-Aided Detection of Cancer in Automate...,Tan13a,Computer-aided Detection of Cancer in Automate...,0.992701,"['Kars11b', 'Tan11d', 'Tan12', 'Tan13a']",['Computer Aided Interpretation of Lesions in ...,"[0.8251748251748252, 0.8251748251748252, 0.821..."


### bibkeys_many_in_one_to_none

 these are cases that are in many, but they are not one-to-one or one-to-many, so they were not matched

In [39]:
bibkeys_many_in_one_to_none = list(set(bibkeys_many) & set(bibkeys_one_to_none))
print(len(bibkeys_many_in_one_to_none))
bibkeys_many_in_one_to_none[:5]

14


['Ares18', 'Leem18b', 'Moor2018', 'Xie21a', 'Anto21']

In [40]:
double_idx = 0

In [41]:
double = bibkeys_many_in_one_to_none[double_idx]

print('df_matches_one_to_one')
display(df_matches_one_to_none[df_matches_one_to_none['bibkey']==double])

print('df_matches_many')
df_matches_many[df_matches_many['up80_bibkeys'].apply(lambda lst: double in lst)]

df_matches_one_to_one


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
4,Ares18,article,iW-Net}: an automatic and minimalistic interac...,Guilherme Aresta and Colin Jacobs and Teresa A...,,,,2018.0


df_matches_many


Unnamed: 0,ss_id,ss_title,max_bibkey,max_bib_title,max_ratio,up80_bibkeys,up80_bib_titles,up80_ratios
3,b3dc561dd990cebc626e10318b8582a198aa3571,iW-Net: an automatic and minimalistic interact...,Ares18,iW-Net: an automatic and minimalistic interact...,1.0,"['Ares18', 'Ares19']",['iW-Net: an automatic and minimalistic intera...,"[1.0, 1.0]"


### bibkeys_many_in_one_to_many

In [42]:
bibkeys_many_in_one_to_many = list(set(bibkeys_many) & set(bibkeys_one_to_many))
print(len(bibkeys_many_in_one_to_many))
bibkeys_many_in_one_to_many[:5]

2


['Anto22', 'Wand17a']

In [43]:
double_idx = 1

In [44]:
double = bibkeys_many_in_one_to_many[double_idx]

print('df_matches_one_to_one')
display(df_matches_one_to_many[df_matches_one_to_many['bibkey']==double])

print('df_matches_many')
df_matches_many[df_matches_many['up80_bibkeys'].apply(lambda lst: double in lst)]

df_matches_one_to_one


Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment,year
89,Wand17a,article,The effect of volumetric breast density on the...,"Wanders, Johanna O.P. and Holland, Katharina a...",d677e08922a1c646ef739f06bae679cc795ba362,"['10123038b80006871abc7396810bed15bfa29387', '...",[ss_ids: same DOI] multiple doi matches ['1012...,2017.0


df_matches_many


Unnamed: 0,ss_id,ss_title,max_bibkey,max_bib_title,max_ratio,up80_bibkeys,up80_bib_titles,up80_ratios
98,d677e08922a1c646ef739f06bae679cc795ba362,The effect of volumetric breast density on the...,Wand17a,The effect of volumetric breast density on the...,1.0,"['Wand15b', 'Wand16', 'Wand17a']",['Volumetric breast density and the risk of sc...,"[0.8181818181818182, 0.8282828282828283, 1.0]"
99,10123038b80006871abc7396810bed15bfa29387,The effect of volumetric breast density on the...,Wand17a,The effect of volumetric breast density on the...,1.0,"['Wand15b', 'Wand16', 'Wand17a']",['Volumetric breast density and the risk of sc...,"[0.8181818181818182, 0.8282828282828283, 1.0]"


# Add ss_ids to DAIG bib

#### Logic:
Loop over entries:
- Skip Journal strings
- Look up entry key (bibkey) in df_matches_one
    - If bibkey not in df, skip
- Check if there are SS matches for this bibkey
    - If so, add this row's 'all_ss_ids' column to the entry

# One-to-one

In [45]:
df_matches_one_to_one[df_matches_one_to_one['ss_id']=='271e85d19b4cd1997a110525ffb66df7a8fa7fe6']

Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment
120,Ciom16a,article,Computer-aided detection of intracoronary sten...,"Ciompi, Francesco and Balocco, Simone and Rigl...",271e85d19b4cd1997a110525ffb66df7a8fa7fe6,['271e85d19b4cd1997a110525ffb66df7a8fa7fe6'],"title matching: single bibkey, single ss_id"


In [46]:
df_matches_one_to_one[df_matches_one_to_one['ss_id']=='de1fc90d61276094416b430605d47ee5f8cc7b4d']

Unnamed: 0,bibkey,type,title,authors,ss_id,all_ss_ids,comment


In [47]:
for entry in diag_bib_raw:
    ### skip Journal strings
    if entry.type == 'string':
        continue
    
    # Get data
    bibkey = entry.key
    match_row = df_matches_one_to_one[df_matches_one_to_one['bibkey'] == bibkey]
    
    # Double checks
    if len(match_row) == 0:
        continue
    assert len(match_row) == 1, 'multiple rows should not be possible in this DF'
    match_row = match_row.iloc[0]
    
    # If there is a match
    if pd.notna(match_row['all_ss_ids']):
        ## SS ID
        # If the field is already filled, check if the same
        if 'ss_id' in entry.fields:
            if not entry.fields['ss_id'] == '{' + match_row['ss_id'] + '}':
                print('ss_id data not the same\n', bibkey, entry.fields['ss_id'], '{' + match_row['ss_id'] + '}')
                break
        # Else fill field
        else: 
            entry.fields['ss_id'] = '{' + match_row['ss_id'] + '}'
            
        ## SS ID
        # If the field is already filled, check if the same
        if 'all_ss_ids' in entry.fields:
            if not entry.fields['all_ss_ids'] == '{' + match_row['all_ss_ids'] + '}':
                print('all_ss_ids data not the same\n', bibkey, entry.fields['all_ss_ids'], '\n', match_row)
                break
        # Else fill field
        else: 
            entry.fields['all_ss_ids'] = '{' + match_row['all_ss_ids'] + '}'
    
    
#     if entry.type == 'phdthesis':
#         entry.fields['journal'] = '{PhD thesis}'
#     if entry.type == 'mastersthesis':
#         entry.fields['journal'] = '{Master thesis}'
#     else:
#         pass

In [48]:
count = 0
for entry in diag_bib_raw:
    if 'ss_id' in entry.fields:
        count += 1
print(count)

940


### One-to-many

In [49]:
def bibkey_to_one_to_many_add_ss_ids_list(bibkey):
    assert len(df_ML_one_to_many[df_ML_one_to_many['bibkey']==bibkey]) == 1, display(df_ML_one_to_many[df_ML_one_to_many['bibkey']==bibkey])
    row = df_ML_one_to_many[df_ML_one_to_many['bibkey']==bibkey]
    all_ss_ids = literal_eval(row['all_ss_ids'].item())
    add = literal_eval(row['ADD_SS_TO_THIS_BIB?'].item())
    add = [literal_eval(tf) for tf in add]
    assert len(all_ss_ids) == len(add), 'all_ss_ids and add should have same len'
    add_ss_ids = [ss_id for ss_id, tf in zip(all_ss_ids, add) if tf==True]
    return add_ss_ids

In [50]:
bibkey_to_one_to_many_add_ss_ids_list('Amga20')

['69999230b02054b82254684a73bb8a4c83878d28',
 'b4c4c3dc91d42114023b0575c3e2273b87446ff7']

#### LOOP

In [None]:
for entry in diag_bib_raw:
    ### skip Journal strings
    if entry.type == 'string':
        continue
    
    # Get data
    bibkey = entry.key
    match_row = df_ML_one_to_many[df_ML_one_to_many['bibkey'] == bibkey]
    # Double checks
    if len(match_row) == 0:
        continue
    elif len(match_row)>1:
        print(match_row)
        print('len>1')
        break
    else:
        pass
        # display(match_row)
        # print(len(match_row))
    assert len(match_row) == 1, 'multiple rows should not be possible in this DF'
    match_row = match_row.iloc[0]
    
    # If there is a match
    if pd.notna(match_row['all_ss_ids']):
        # get add_ss_ids based on manual matching
        # print(bibkey)
        add_ss_ids = bibkey_to_one_to_many_add_ss_ids_list(bibkey)
        # print(add_ss_ids)
        ## SS ID
        # If the field is already filled, check if the same
        # if 'ss_id' in entry.fields:
        #     if not entry.fields['ss_id'] == '{' + match_row['ss_id'] + '}':
        #         print('ss_id data not the same\n', bibkey, entry.fields['ss_id'], '{' + match_row['ss_id'] + '}')
        #         break
        # # Else fill field
        # else: 
        #     entry.fields['ss_id'] = '{' + str(add_ss_ids) + '}'
            
        ## ALL SS IDS
        # If the field is already filled, check if the same
        if 'all_ss_ids' in entry.fields:
            if not entry.fields['all_ss_ids'] == '{' + str(add_ss_ids) + '}':
                print('all_ss_ids data not the same\n', bibkey, entry.fields['all_ss_ids'], '\n', str(add_ss_ids))
                previous = literal_eval(entry.fields['all_ss_ids'].strip('{}'))
                new = add_ss_ids
                combined = list(set(previous) | set(new))
                print(f'replacing with combined: {combined}')
                entry.fields['all_ss_ids'] = '{' + str(combined) + '}'
                # break
        # Else fill field
        else: 
            entry.fields['all_ss_ids'] = '{' + str(add_ss_ids) + '}'
    


In [None]:
count = 0
for entry in diag_bib_raw:
    if 'all_ss_ids' in entry.fields:
        count += 1
print(count)

## Many to many

In [None]:
display(df_ML_many.head(2))

In [None]:
bibkey = 'Anto21'

In [None]:
df_temp_bibkey_match = df_ML_many.copy()[df_ML_many['up80_bibkeys'].apply(lambda x: bibkey in x)]
df_temp_bibkey_match['bibkey_idx'] = df_temp_bibkey_match['up80_bibkeys'].apply(lambda x: literal_eval(x).index(bibkey))
df_temp_bibkey_match['add_ss_id'] = df_temp_bibkey_match.apply(lambda row: literal_eval(literal_eval(row['ADD_SS_TO_BIB?'])[row['bibkey_idx']]), axis=1)
add_ss_ids = list(df_temp_bibkey_match[df_temp_bibkey_match['add_ss_id']]['ss_id'])

In [None]:
bibkey = 'Adri11c'

In [None]:
df_temp_bibkey_match = df_ML_many.copy()[df_ML_many['up80_bibkeys'].apply(lambda x: bibkey in x)]
df_temp_bibkey_match['bibkey_idx'] = df_temp_bibkey_match['up80_bibkeys'].apply(lambda x: literal_eval(x).index(bibkey))
df_temp_bibkey_match['add_ss_id'] = df_temp_bibkey_match.apply(lambda row: literal_eval(literal_eval(row['ADD_SS_TO_BIB?'])[row['bibkey_idx']]), axis=1)
add_ss_ids = list(df_temp_bibkey_match[df_temp_bibkey_match['add_ss_id']]['ss_id'])
add_ss_ids

In [None]:
def bibkey_to_many_add_ss_ids_list(bibkey, df_temp):
    # print(bibkey)
    df_temp['bibkey_idx'] = df_temp['up80_bibkeys'].apply(lambda x: literal_eval(x).index(bibkey))
    # try:
    df_temp['add_ss_id'] = df_temp.apply(lambda row: literal_eval(literal_eval(row['ADD_SS_TO_BIB?'])[row['bibkey_idx']]), axis=1)
    # except:
    #     df_temp.apply(lambda row: print(row['ADD_SS_TO_BIB?'], '\n\n'), axis=1)
    #     df_temp.apply(lambda row: print(literal_eval(row['ADD_SS_TO_BIB?']),'\n\n'), axis=1)
    #     df_temp.apply(lambda row: print(literal_eval(row['ADD_SS_TO_BIB?'])[row['bibkey_idx']],'\n\n'), axis=1)
    #     df_temp.apply(lambda row: print(literal_eval(literal_eval(row['ADD_SS_TO_BIB?'])[row['bibkey_idx']])), axis=1)
    add_ss_ids = list(df_temp[df_temp['add_ss_id']]['ss_id'])
    return add_ss_ids if add_ss_ids!=[] else None

In [None]:
for entry in diag_bib_raw:
    ### skip Journal strings
    if entry.type == 'string':
        continue
    
    # Get data
    bibkey = entry.key
    df_temp_bibkey_match = df_ML_many.copy()[df_ML_many['up80_bibkeys'].apply(lambda x: bibkey in literal_eval(x))]
    # Double checks
    if len(df_temp_bibkey_match) == 0:
        continue
    
    
    # If there is a match
    # get add_ss_ids based on manual matching
    # print(bibkey)
    add_ss_ids = bibkey_to_many_add_ss_ids_list(bibkey, df_temp_bibkey_match)
    
    if add_ss_ids == None:
        continue
    # print(add_ss_ids)
    ## SS ID
    # If the field is already filled, check if the same
    # if 'ss_id' in entry.fields:
    #     if not entry.fields['ss_id'] == '{' + match_row['ss_id'] + '}':
    #         print('ss_id data not the same\n', bibkey, entry.fields['ss_id'], '{' + match_row['ss_id'] + '}')
    #         break
    # # Else fill field
    # else: 
    #     entry.fields['ss_id'] = '{' + str(add_ss_ids) + '}'

    ## ALL SS IDS
    # If the field is already filled, check if the same
    if 'all_ss_ids' in entry.fields:
        if not set(literal_eval(entry.fields['all_ss_ids'].strip('{}'))) == set(add_ss_ids):
            print('all_ss_ids data not the same\n', bibkey, '\n--- bib:', 
                  set(literal_eval(entry.fields['all_ss_ids'].strip('{}'))), 
                  '\n--- trying_to_add:', set(add_ss_ids))
            previous = literal_eval(entry.fields['all_ss_ids'].strip('{}'))
            new = add_ss_ids
            combined = list(set(previous) | set(new))
            print(f'replacing with combined: {combined}')
            entry.fields['all_ss_ids'] = '{' + str(combined) + '}'
            # break
    # Else fill field
    else: 
        entry.fields['all_ss_ids'] = '{' + str(add_ss_ids) + '}'
    

In [None]:
count = 0
for entry in diag_bib_raw:
    if 'all_ss_ids' in entry.fields:
        count += 1
print(count)

# Next steps:
- Flag many to none-one-many bibkeys
    - these need to be done manually
    - check one-to-many cases and decide what to do with these matches

# Save new diag_taverne.bib

Here its important to be aware that withing the processbib.py file from the bib handling code, there is a hardcoded 'allowed_fields' variable on top of the file. Only fields with names with in this list can be saved to the diag.bib

In [None]:
path_output_diag_bib = os.path.join('script_data', 'diag_ss.bib')

In [None]:
# save_to_file(diag_bib_raw, None, path_output_diag_bib)