In [1]:
## inspect bibliography for duplicates and missing abstracts

In [1]:
import nbib
import rispy

In [2]:
refs = nbib.read_file('bib/pubmed-9NOTanimal-set 2.nbib')

In [17]:
len(refs)

6879

In [23]:
for entries in refs[:1]:
    print(entries['authors'][0])

{'author': 'Barchi, Alberto', 'author_abbreviated': 'Barchi A', 'affiliations': ['Gastroenterology and Digestive Endoscopy, IRCCS Ospedale San Raffaele, Milan, Italy.'], 'first_name': 'Alberto', 'last_name': 'Barchi'}


In [25]:
# ASReview does not support nbib, I prefer therefore to export it to RIS
filepath = 'bib/export-pubmed-9NOTanimal-set 2.ris'

with open(filepath, 'w') as bibliography_file:
    rispy.dump(refs, bibliography_file)



In [26]:
from asreview import ASReviewData

class tidyBib():
    def __init__(self, ris):
        # load .ris file using asreview
        self.ris = ASReviewData.from_file(ris).to_dataframe()
        
    @staticmethod
    def is_missing_abstract(abstract):
        if isinstance(abstract, str):
            return abstract.strip() == ''
        return pd.isna(abstract)
    
    @staticmethod
    def resolve_duplicates(group):
        # Handle duplicates: retain one entry, preferring the one with an abstract
        if group['abstract'].notna().any():
            return group.loc[group['abstract'].notna()].iloc[0]
        return group.iloc[0]
        

    def return_resolved_ris(self):
        # Identify and resolve duplicates
        resolved_ris = self.ris.groupby(['title', 'authors'], group_keys=False).apply(self.resolve_duplicates)
        return resolved_ris
    
    def return_missing_abstracts(self):
        # Identify entries with missing abstracts
        missing_abstracts = self.ris[self.ris['abstract'].apply(self.is_missing_abstract)]
        return missing_abstracts
        
        
    def return_tidy_bib(self):
        # identify and resolve duplicates
        # remove missing abstracts
        # return tidy bib
        
        resolved_ris = self.return_resolved_ris()
        tidy_ris = resolved_ris[~resolved_ris['abstract'].apply(self.is_missing_abstract)]
        
        return tidy_ris

    def save_tidy_ris(self, output_file):
        # save tidy bib
        tidy_ris = self.return_tidy_bib()
        tidy_ris.to_csv(output_file)
        return tidy_ris
    

  from tqdm.autonotebook import tqdm, trange


In [27]:
tidy = tidyBib('bib/export-pubmed-9NOTanimal-set 2.ris')

In [28]:
tidy.ris.title.nunique()

6874

In [42]:
len(tidy.ris)

6879

In [29]:
tidy.ris.head()

Unnamed: 0_level_0,type_of_reference,title,abstract,authors,language,keywords,doi
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,JOUR,Microbiota profiling in esophageal diseases: N...,Gut microbiota is recognized nowadays as one o...,"[""{'author': 'Barchi, Alberto', 'author_abbrev...",eng,"['Achalasia', 'Cancer', 'Eosinophilic esophagi...",10.1016/j.csbj.2023.12.026
1,JOUR,Oral bacteriome and mycobiome of patients with...,ETHNOPHARMACOLOGICAL RELEVANCE: Moshen Fuyuan ...,"[""{'author': 'Wang, Xin-Hui', 'author_abbrevia...",eng,"['Idiopathic membranous nephropathy (IMN)', 'M...",10.1016/j.jep.2024.118233
2,JOUR,High-throughput microfluidic quantitative PCR ...,Comprehensive data on bacterial and viral path...,"[""{'author': 'Shrestha, Sadhana', 'author_abbr...",eng,"['Antibiotic resistance gene', 'Class 1 integr...",10.1016/j.envres.2024.119156
3,JOUR,Prevalence and characterization of aminoglycos...,Campylobacter jejuni is recognized as a signif...,"[""{'author': 'Kang, Jin', 'author_abbreviated'...",eng,"['Aminoglycoside resistance', 'Campylobacter j...",10.1016/j.ijfoodmicro.2024.110747
4,JOUR,Wastewater surveillance together with metaviro...,How to address public health priorities after ...,"['{\'author\': \'Zhang, Ziqiang\', \'author_ab...",eng,"['Amplicon sequencing', 'Metaviromic data', 'N...",10.1016/j.jhazmat.2024.134635


In [43]:
len(tidy.return_missing_abstracts())

216

In [44]:
missing_abstracts = tidy.return_missing_abstracts()

In [46]:
len(refs) - len(missing_abstracts)

6663

In [36]:
missing_abstracts.head()

Unnamed: 0_level_0,type_of_reference,title,abstract,authors,language,keywords,doi
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
938,JOUR,"Giardiasis Outbreaks-United States, 2012-2017.",,,eng,,10.1097/INF.0000000000003180
1087,JOUR,[The effect of gut microbiota composition on i...,,"['{\'author\': \'Yang, T\', \'author_abbreviat...",chi,,10.3760/cma.j.cn112140-20200508-00480
1162,JOUR,Treatment of giardiasis in children: Randomize...,,"['{\'author\': \'Vakkilainen, Svetlana\', \'au...",eng,,10.1016/j.jinf.2020.08.050
1226,JOUR,The Growth of Young Children and Associations ...,,"[""{'author': 'Stine, O Colin', 'author_abbrevi...",eng,,10.1093/cid/ciz907
1257,JOUR,Refractory giardiasis in medical students retu...,,"[""{'author': 'Dao, Thi Loi', 'author_abbreviat...",eng,,10.1016/j.tmaid.2019.101469


In [52]:
missing_abstracts.to_csv('bib/missing_abstracts-pubmed-9NOTanimal-set 2.csv')

In [31]:
for title in tidy.return_missing_abstracts()['title']:
    print(title)

Giardiasis Outbreaks-United States, 2012-2017.
[The effect of gut microbiota composition on immunogenicity of rotavirus vaccine].
Treatment of giardiasis in children: Randomized trial of rectal metronidazole versus oral tinidazole.
The Growth of Young Children and Associations With Their Intestinal Microbiota and Campylobacter.
Refractory giardiasis in medical students returning from humanitarian work abroad.
The skin microbiota as a link between rosacea and its systemic comorbidities.
A new strategy for high-throughput identification of human gut microbiota containing specific monosaccharide.
Examining the influence of weather on rotavirus infection.
Cryptosporidium genotyping and land use mapping for hazard identification and source tracking in a small mixed rural-urban watershed in Southeastern Brazil.
50 Years Ago in The Journal of Pediatrics: Epidemiology and Etiology of Severe Infantile Diarrhea.
Cholangiocytes and the environment in primary sclerosing cholangitis: where is the l

In [48]:
tidy_ris = tidy.return_tidy_bib()

  resolved_ris = self.ris.groupby(['title', 'authors'], group_keys=False).apply(self.resolve_duplicates)


In [49]:
len(tidy_ris)

6662

In [51]:
tidy_ris.to_csv('bib/pubmed-9NOTanimal-set 2-ready-for-ASReview.csv')