In [None]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
import pandas as pd
import os

import re
import numpy as np
import shutil

In [None]:
def prettify(elem):
    """Serialize ``elem`` to UTF-8 XML and return it re-indented.

    The element is dumped with ElementTree, re-parsed with minidom and
    pretty-printed using one space per nesting level.
    """
    xml_bytes = tostring(elem, 'utf-8')
    return minidom.parseString(xml_bytes).toprettyxml(indent=" ")

In [None]:
### Function that removes "arrays" from the author fields

def removeAutor(my_str):
    """Strip the list syntax from a stringified author list.

    Removes, in this order, every ``['``, every ``']`` and every remaining
    single quote, e.g. ``"['Smith, John', 'Doe, Jane']"`` becomes
    ``"Smith, John, Doe, Jane"``. Apostrophes inside names are removed as
    well (``O'Brien`` -> ``OBrien``).

    Parameters
    ----------
    my_str : str
        Raw author field.

    Returns
    -------
    str
        The author field without brackets and quotes.
    """
    # The fragments are plain literals, so str.replace is used instead of
    # regular expressions; this avoids the invalid "\[" / "\]" escape
    # sequences that non-raw pattern strings would need.
    for fragment in ("['", "']", "'"):
        my_str = my_str.replace(fragment, "")
    return my_str

In [None]:
### Function that replaces the literal escape sequences "\ud" and "\n" (backslash followed by letters) with a space
def removePassage(my_str):
    """Replace literal ``\\ud`` and ``\\n`` sequences in ``my_str`` with a space.

    Only the two-/three-character sequences that start with a backslash are
    matched; real newline characters are left untouched.
    """
    cleaned = my_str
    for escaped_sequence in ("\\\\ud", "\\\\n"):
        cleaned = re.sub(escaped_sequence, " ", cleaned)
    return cleaned

## Extract Core Data

In [None]:
# Metadata of the final selection of papers. The labelling loop below reads the
# columns `coreId` (compared against an int), `title` and `authors`.
df_meta = pd.read_csv("metadata_15553.csv") # path to the metadata CSV (coreId, title, authors)
df_meta

In [None]:
# Folder holding the output of the GROBID "createTraining" batch command.
# The notebook reads "<path_input>/TEI/Core_ID_xxxxxxx.training.header.tei.xml"
# and "<path_input>/HEADER/Core_ID_xxxxxxx.training.header".
path_input = "/header_training"
# Folder that receives the relabelled TEIs ("TEI" subfolder) and the copied
# header files ("HEADER" subfolder).
path_output = "/header_training_edited"

In [None]:
# Get file names, file paths and CORE IDs of the TEIs created by the GROBID
# "createTraining" batch command.
tei_prefix = 'Core_ID_'
tei_suffix = ".training.header.tei.xml"

files = []
file_paths = []
core_ids = []
for dir_path, dir_names, file_names in os.walk(path_input + "/TEI"):
    # os.walk yields entries in arbitrary order; sorting (in place for the
    # directories, so the traversal follows it) makes the run reproducible.
    dir_names.sort()
    for file_name in sorted(file_names):
        # Skip everything that is not a header TEI (hidden/system files,
        # leftovers, ...): such names would produce a non-numeric "ID".
        if not (file_name.startswith(tei_prefix) and file_name.endswith(tei_suffix)):
            continue
        files.append(file_name)
        file_paths.append(os.path.join(dir_path, file_name))
        # The CORE ID is what remains between the fixed prefix and suffix.
        core_ids.append(file_name[len(tei_prefix):-len(tei_suffix)])

# Write Training TEI data

In [None]:
### Label title and author spans in every header TEI and write the training files
#
# For each (core_id, file_path) pair the loop
#   1. flattens the TEI to plain text,
#   2. locates the metadata title and the author names in that text,
#   3. rebuilds the <front> node with <docTitle>/<byline> markup around them,
#   4. writes the TEI to path_output/TEI and copies the matching header file
#      from path_input/HEADER to path_output/HEADER.
# Papers that raise any exception are reported and collected in `error_papers`.

no_autor = []       # CORE IDs for which no author token was found in the text
no_title = []       # CORE IDs whose metadata title was not found in the text
error_papers = []   # CORE IDs whose processing raised an exception
result_tokens = []
result_label = []
i = 0               # number of failed papers; never reused as a loop variable

# Two initials next to a name ("J.K.", "J. K.", ...), searched in a window of
# INITIALS_WINDOW characters before and after each matched name.
INITIALS_PATTERN = r'(.\..\.)|(.\. .\.)|(.\.. \.)|(. \. . \.)'
INITIALS_WINDOW = 7
# Name spans closer than this many characters are merged into one byline.
MERGE_GAP = 6

# Make sure the target folders exist before the first file is written.
os.makedirs(path_output + "/TEI", exist_ok=True)
os.makedirs(path_output + "/HEADER", exist_ok=True)

for core_id, file_path in zip(core_ids, file_paths):

    # Reset per paper so that a title position from a previous paper can
    # never be picked up by accident.
    beg_title = None
    end_title = None

    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Plain text of the whole TEI: line-break tags are dropped (keeping a
        # space next to hyphens), all other tags, newlines and tabs become
        # spaces, and runs of whitespace are collapsed. This must happen
        # before the "front" node is removed below.
        tei_string = ET.tostring(root, encoding='utf8').decode('utf8').replace('-<lb />','- ').replace('<lb />-',' -').replace('<lb />','')
        text_tei = re.sub(r'(<.*?>)|(\n)|(\t)', ' ', tei_string)
        text_tei = ' '.join(text_tei.split())
        text_lower = text_tei.lower()

        # Remove the "front" node
        for elem in root.iter():
            for child in list(elem):
                if child.tag == 'front':
                    elem.remove(child)

        paper_meta = df_meta.loc[df_meta.coreId == int(core_id)]

        title = ' '.join(removePassage(paper_meta.title.iloc[0]).split()).lower()

        # Comma-separated author tokens with excessive whitespace removed.
        autors = [' '.join(token.split())
                  for token in removeAutor(paper_meta.authors.iloc[0]).split(",")]

        beg_title = text_lower.find(title)
        if beg_title < 0:
            no_title.append(core_id)
            beg_title = None
        else:
            end_title = beg_title + len(title)

        # Tokens are assumed to alternate surname, forename, surname, ...
        # (even positions are treated as surnames) -- TODO confirm against
        # the metadata export.
        name_tokens = [surname.lower() for surname in autors[0::2]]

        if re.match(r'.\.', autors[1]) is None:
            # The first forename does not start with an initial: search the
            # individual forename words as well, ahead of the surnames.
            forename_tokens = [word.lower()
                               for forename in autors[1::2]
                               for word in forename.split()]
            name_tokens = forename_tokens + name_tokens

        beg_author = []
        end_author = []
        for autor in name_tokens:
            beg = text_lower.find(autor)
            if beg > -1:
                if beg not in beg_author:
                    beg_author.append(beg)
                    end_author.append(beg + len(autor))
                else:
                    # First hit already belongs to another token: take the
                    # following occurrences until one is already recorded.
                    next_beg = text_lower.find(autor, beg + 1)
                    while next_beg > -1 and next_beg not in beg_author:
                        beg_author.append(next_beg)
                        end_author.append(next_beg + len(autor))
                        next_beg = text_lower.find(autor, next_beg + 1)

        if beg_author == []:
            no_autor.append(core_id)

        if beg_title is None:
            # Everything below needs the title position; report the paper
            # through the common error path instead of failing on a
            # missing variable.
            raise LookupError("title not found in TEI text")

        if len(beg_author) > len(name_tokens):
            # More hits than name tokens: rank the hits by their distance to
            # the title.
            span_author = dict(zip(beg_author, end_author))
            diff = len(beg_author) - len(name_tokens)
            dist = []
            for p in beg_author:
                if p < beg_title:
                    dist.append(abs(p - beg_title))
                else:
                    dist.append(abs(p - end_title))

            dict1 = dict(zip(dist, beg_author))
            dist.sort(reverse=False)

            # NOTE(review): `b` and `e` are re-created on every pass, so only
            # the hit ranked `diff`-th closest to the title survives, and
            # equal distances overwrite each other in `dict1`. This looks
            # unintended but the desired selection is not evident from the
            # code, so the behavior is left unchanged -- please confirm.
            for k in range(len(dist[0:diff])):
                b = []
                e = []
                b.append(dict1[dist[k]])
                e.append(span_author[dict1[dist[k]]])
                beg_author = b
                end_author = e

        if re.search(r'.\.', autors[1]) is not None:
            # Forenames are given as initials: look for initials directly
            # before and after every matched name. The lists grow while they
            # are iterated on purpose, so newly found initials are examined
            # too; `seen_spans` keeps a span from being added twice, which
            # guarantees that the scan terminates.
            seen_spans = set(zip(beg_author, end_author))
            for ba, ea in zip(beg_author, end_author):
                # Clamp at 0: a negative slice start would wrap around to
                # the end of the text.
                window_start = max(ba - INITIALS_WINDOW, 0)
                windows = (
                    (window_start, text_tei[window_start:ba]),
                    (ea, text_tei[ea:ea + INITIALS_WINDOW]),
                )
                for offset, window in windows:
                    found = re.search(INITIALS_PATTERN, window)
                    if found is None:
                        continue
                    if not any(token.isupper() for token in window.split()):
                        continue
                    span = (offset + found.start(), offset + found.end())
                    if span not in seen_spans:
                        seen_spans.add(span)
                        beg_author.append(span[0])
                        end_author.append(span[1])

        # Unique, ascending start and end positions.
        beg_author = sorted(dict.fromkeys(beg_author))
        end_author = sorted(dict.fromkeys(end_author))

        # Gap between the end of one name span and the start of the next.
        gaps = [abs(ba - ea) for ba, ea in zip(beg_author[1:], end_author[:-1])]
        if all(gap < MERGE_GAP for gap in gaps):
            # All names are adjacent: one byline from first start to last end.
            indices_author = [min(beg_author), max(end_author)]
        else:
            indices_author = beg_author + end_author
            for ba, ea in zip(beg_author[1:], end_author[:-1]):
                if abs(ba - ea) < MERGE_GAP:
                    # Drop the inner boundaries of two adjacent spans.
                    indices_author.remove(ba)
                    indices_author.remove(ea)
            # The boundaries must be in text order before they are split into
            # (start, end) pairs below; on the concatenated list the
            # even/odd positions would not pair starts with their ends.
            indices_author.sort()

        span_starts = indices_author[0::2]
        span_ends = indices_author[1::2]
        author_positions = set()
        for k in range(len(span_starts)):
            author_positions.update(range(span_starts[k], span_ends[k]))

        # Cut the text at every author/title boundary.
        indices = indices_author + [beg_title, end_title]
        indices.sort(reverse=False)
        parts = [text_tei[start:stop] for start, stop in zip(indices, indices[1:] + [None])]

        # Second child of the root; presumably the TEI <text> element that
        # held the removed <front> -- TODO confirm.
        text = root[1]
        front = SubElement(text, 'front')

        final_string = ''
        if min(indices) != 0:
            final_string += text_tei[0:min(indices)]

        for start, part in zip(indices, parts):
            if part.lower() == title.lower():
                final_string += '<docTitle><titlePart>' + part + '</titlePart></docTitle>'
            elif start in author_positions:
                final_string += "<byline><docAuthor> " + part + "</docAuthor></byline>"
            else:
                final_string += part
        front.text = final_string

        file_name = path_output + "/TEI/Core_ID_" + str(core_id) + ".training.header.tei.xml"
        # The markup was inserted as text and is therefore escaped by the
        # serializer; un-escape the angle brackets to turn it into tags.
        mydata = str(prettify(root)).replace("&lt;","<").replace("&gt;",">")
        with open(file_name, "w", encoding="utf-8") as myfile:
            myfile.write(mydata)

        shutil.copyfile(path_input+"/HEADER/Core_ID_" + str(core_id) + ".training.header",
                        path_output+ "/HEADER/Core_ID_" + str(core_id) + ".training.header")
    except Exception as exc:
        # Best effort: report the paper, remember it and continue with the
        # next one. KeyboardInterrupt is deliberately not caught.
        i = i + 1
        error_papers.append(core_id)
        print("SEVERE ERROR "+ str(core_id)+ "\n"+str(i))
        print(repr(exc))