In [None]:
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

## Iterating over the GROBID XML files to add body texts to the DataFrame

In [10]:
# Load the manually labelled papers table and display it as the cell output.
labelled_csv_path = 'manually_labelled.csv'
manual_papers = pd.read_csv(labelled_csv_path)
manual_papers

Unnamed: 0,paperid,defined_concept,extracted_definitions,reference_if_def_from_another_paper,explicit_implicit,examples,relevance,notes,grobid,body_text,n_words,title,abstract
0,f6fabad4373992c433101e0d1d2b19901b39ef97,,,,none,,DISCARDED,,paper_f6fabad4373992c433101e0d1d2b19901b39ef97...,IntroductionThere are many cases in which …rms...,10255,Partial Cross Ownership and Tacit Collusion,We examine the effects that passive investment...
1,2f166f9c50a6e18407874ba739c9c7cb1b51d4f7,,,,none,,DISCARDED,spanish language,paper_2f166f9c50a6e18407874ba739c9c7cb1b51d4f7...,"1. ""QUEBRANTO"". LA EXCLUSIÓN EDUCATIVA Quizás ...",8673,"Inclusión y Exclusión Educativa. De Nuevo, ""Vo...",En este texto se plantean ideas e interrogante...
2,6756da5a0d5e037a0c143e6418b85633561c927c,,,,none,,DISCARDED,,paper_6756da5a0d5e037a0c143e6418b85633561c927c...,"ReportIn this report, we describe a simple cor...",1689,A simple correction for multiple testing for s...,"In this report, we describe a simple correctio..."
3,ed0dbdbf192829018592a7be649b2e403cbd92fa,,,,none,,DISCARDED,,paper_ed0dbdbf192829018592a7be649b2e403cbd92fa...,Changes in cell composition underlie diverse p...,7063,Robust enumeration of cell subsets from tissue...,
4,c9b94df36183d880403fcaaf5ba8b5b74d022de8,,,,none,,DISCARDED,,paper_c9b94df36183d880403fcaaf5ba8b5b74d022de8...,"IntroductionTogether with neurons, glia (astro...",11040,An RNA-Sequencing Transcriptome and Splicing D...,The major cell classes of the brain differ in ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2178,d2bfca01f27526c542c74278da145141e89d7293,,,,none,,DISCARDED,,paper_d2bfca01f27526c542c74278da145141e89d7293...,"IntroductionPlant biostimulants, or agricultur...",16450,Agricultural uses of plant biostimulants,
2179,de0535f9a2691f331609574920cb17f90a644540,,,,none,,DISCARDED,,paper_de0535f9a2691f331609574920cb17f90a644540...,"Aging, an inevitable biological process, is ch...",6285,Oxidative damage and mitochondrial decay in ag...,We argue for the critical role of oxidative da...
2180,e2bda1a9c0c5263b0812a9227460db6b710c9fac,,,,none,,DISCARDED,,paper_e2bda1a9c0c5263b0812a9227460db6b710c9fac...,IntroductionThe first cases of coronavirus dis...,2081,Tracking Social Media Discourse About the COVI...,"Background At the time of this writing, the co..."
2181,e2e796ccef8c26d5154b2e70feb9582125e6ff2f,,,,none,,DISCARDED,italian paper,paper_e2e796ccef8c26d5154b2e70feb9582125e6ff2f...,INTRODUÇÃOTrabalhos científicos envolvendo sem...,2390,Envelhecimento acelerado em sementes de rúcula...,The objective of the present study was to inve...


In [68]:
# Folder that is scanned for GROBID TEI output files.
directory = 'all_pdfs_grobid'

files = os.listdir(directory)
# Keep only the GROBID TEI outputs.
pdf_files = [name for name in files if name.endswith('.grobid.tei.xml')]
# File name up to the first dot, with the 'paper_' marker removed, gives the id.
pdf_ids = [name.split('.')[0].replace('paper_', '') for name in pdf_files]
print(len(pdf_ids))

In [None]:
# Keep only the labelled papers whose id appears in pdf_ids (example code for
# the 'manual_labeled_papers' df).
# .copy() turns the boolean-mask selection into an independent frame, so the
# column assignment below writes to this frame unambiguously instead of
# triggering pandas' SettingWithCopyWarning.
manual_papers = manual_papers[manual_papers['paperid'].isin(pdf_ids)].copy()
# Name of the GROBID file expected for each paper id.
manual_papers['grobid'] = manual_papers['paperid'].apply(lambda x: f'paper_{x}.grobid.tei.xml')
len(manual_papers)

In [77]:
def strip_namespace(tag):
    """Return `tag` without a leading '{namespace}' qualifier.

    Everything after the first '}' is returned; a tag that contains no '}'
    is returned unchanged.
    """
    _, brace, local_name = tag.partition('}')
    return local_name if brace else tag

def extract_text(element):
    """Recursively collect the text contained in `element`.

    The element's own text, the text of every descendant, and each child's
    tail text are concatenated in document order with no separator added.
    An element whose namespace-stripped tag is 'figure' contributes an empty
    string, so its whole subtree is skipped; the tail that follows it is
    still appended by its parent.
    """
    if strip_namespace(element.tag) == 'figure':
        return ""

    pieces = [element.text or ""]
    for node in element:
        pieces.append(extract_text(node))
        # Tail text follows the child's closing tag and belongs to this element.
        pieces.append(node.tail or "")
    return "".join(pieces)

def extract_body_text(file_path):
    """Return the text of every 'body' element in the XML file at `file_path`.

    Each element whose namespace-stripped tag is 'body' is converted with
    extract_text, stripped of surrounding whitespace, and the results are
    joined with single spaces.

    Failures are reported through the return value, not raised:
    - 'Body element not found' when the document has no 'body' element;
    - 'Error parsing file: <exception>' when anything raises during parsing
      or extraction.
    Callers therefore receive a string in every case and have to compare
    against these messages to detect a failure.
    """
    try:
        root = ET.parse(file_path).getroot()
        bodies = [node for node in root.iter() if strip_namespace(node.tag) == 'body']
        if not bodies:
            return 'Body element not found'
        return ' '.join(extract_text(body).strip() for body in bodies)
    except Exception as err:
        return f'Error parsing file: {err}'


In [80]:
# Extract the body text for every paper, in row order, from its GROBID file.
# extract_body_text returns a message string instead of raising, so rows whose
# file is missing or unparsable end up holding that message.
body_texts = [
    extract_body_text(f'all_pdfs_grobid/{grobid_name}')
    for grobid_name in manual_papers['grobid']
]

manual_papers['body_text'] = body_texts

Unnamed: 0,paperid,defined_concept,extracted_definitions,reference_if_def_from_another_paper,explicit_implicit,examples,relevance,notes,grobid,body_text,n_words
1,f6fabad4373992c433101e0d1d2b19901b39ef97,,,,none,,DISCARDED,,paper_f6fabad4373992c433101e0d1d2b19901b39ef97...,IntroductionThere are many cases in which …rms...,10255
3,2f166f9c50a6e18407874ba739c9c7cb1b51d4f7,,,,none,,DISCARDED,spanish language,paper_2f166f9c50a6e18407874ba739c9c7cb1b51d4f7...,"1. ""QUEBRANTO"". LA EXCLUSIÓN EDUCATIVA Quizás ...",8673
4,6756da5a0d5e037a0c143e6418b85633561c927c,,,,none,,DISCARDED,,paper_6756da5a0d5e037a0c143e6418b85633561c927c...,"ReportIn this report, we describe a simple cor...",1689
5,ed0dbdbf192829018592a7be649b2e403cbd92fa,,,,none,,DISCARDED,,paper_ed0dbdbf192829018592a7be649b2e403cbd92fa...,Changes in cell composition underlie diverse p...,7063
6,c9b94df36183d880403fcaaf5ba8b5b74d022de8,,,,none,,DISCARDED,,paper_c9b94df36183d880403fcaaf5ba8b5b74d022de8...,"IntroductionTogether with neurons, glia (astro...",11040
...,...,...,...,...,...,...,...,...,...,...,...
2493,d2bfca01f27526c542c74278da145141e89d7293,,,,none,,DISCARDED,,paper_d2bfca01f27526c542c74278da145141e89d7293...,"IntroductionPlant biostimulants, or agricultur...",16450
2494,de0535f9a2691f331609574920cb17f90a644540,,,,none,,DISCARDED,,paper_de0535f9a2691f331609574920cb17f90a644540...,"Aging, an inevitable biological process, is ch...",6285
2495,e2bda1a9c0c5263b0812a9227460db6b710c9fac,,,,none,,DISCARDED,,paper_e2bda1a9c0c5263b0812a9227460db6b710c9fac...,IntroductionThe first cases of coronavirus dis...,2081
2496,e2e796ccef8c26d5154b2e70feb9582125e6ff2f,,,,none,,DISCARDED,italian paper,paper_e2e796ccef8c26d5154b2e70feb9582125e6ff2f...,INTRODUÇÃOTrabalhos científicos envolvendo sem...,2390


In [83]:
# Write the enriched table back to the CSV it was loaded from.
# index=False keeps the row index out of the file; without it the index is
# written as an unnamed first column, which read_csv then loads as an extra
# 'Unnamed: 0' column on the next run.
manual_papers.to_csv('manually_labelled.csv', index=False)