# Import Packages & Define functions

In [None]:
# read csv
import pandas as pd
import os
# PdfMiner
import glob
import numpy as np
from io import StringIO
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Token Vectorization
from langdetect import detect 
import fasttext.util
import fasttext
########################################
from random import random
from numpy import array
from numpy import cumsum
########################################
import regex as re  
import string
import re
########################################
from datetime import datetime
import collections
########################################
# scikit-learn metrics used for the evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import math

from dataclasses import dataclass
from bs4 import BeautifulSoup

In [None]:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path of the TEI file; it is opened as UTF-8 text.

    Returns:
        The ``BeautifulSoup`` object built with the ``'lxml'`` parser.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # The former trailing ``raise RuntimeError`` could never run: the ``with``
    # body either returns the soup or propagates the exception it hit.
    with open(tei_file, 'r', encoding="utf-8") as tei:
        return BeautifulSoup(tei, 'lxml')
    
def elem_to_text(elem, default=''):
    """Return ``elem.getText()`` for a truthy *elem*, otherwise *default*."""
    return elem.getText() if elem else default
    
@dataclass
class Person:
    """Name parts of a single author.

    ``TEIFile.authors`` fills the fields from the ``forename`` (type "first"),
    ``forename`` (type "middle") and ``surname`` children of a ``persname``
    element; a missing child yields ``''`` (the default of ``elem_to_text``).
    """
    # Given name, or '' when not present.
    firstname: str
    # Middle name or initial, or '' when not present.
    middlename: str
    # Family name, or '' when not present.
    surname: str
    
class TEIFile(object):
    """Read-only view on one TEI document parsed with ``read_tei``.

    The file is parsed once in ``__init__``; ``title``, ``abstract`` and
    ``text`` cache their value after the first access.  Every accessor
    returns an empty value (``''`` or ``[]``) instead of raising when the
    corresponding element is missing from the document.
    """

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        """Text of the first ``idno`` element with ``type="DOI"``, or ``''``."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        return idno_elem.getText()

    @property
    def title(self):
        """Text of the first ``title`` element, or ``''`` when there is none."""
        if not self._title:
            title_elem = self.soup.title
            # A document without a title used to raise AttributeError here.
            if title_elem is not None:
                self._title = title_elem.getText()
        return self._title

    @property
    def abstract(self):
        """Whitespace-stripped text of the ``abstract`` element, or ``''``."""
        if not self._abstract:
            abstract_elem = self.soup.abstract
            if abstract_elem is not None:
                self._abstract = abstract_elem.getText(separator=' ', strip=True)
        return self._abstract

    @property
    def authors(self):
        """List of ``Person`` built from the ``author`` elements in ``analytic``.

        Authors without a ``persname`` child are skipped.  Returns ``[]`` when
        the document has no ``analytic`` element.
        """
        analytic = self.soup.analytic
        if analytic is None:
            return []

        result = []
        for author in analytic.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def text(self):
        """Space-joined text of all ``div`` elements in ``body`` without a ``type``.

        Returns ``''`` when the document has no ``body``.
        """
        # ``is None`` (instead of truthiness) so an empty body is cached too.
        if self._text is None:
            body = self.soup.body
            divs = body.find_all("div") if body is not None else []
            # A div without "type" is neither an appendix nor references,
            # just plain text.
            self._text = " ".join(
                div.get_text(separator=' ', strip=True)
                for div in divs
                if not div.get("type")
            )
        return self._text

In [None]:
def listdir_fullpath(d):
    """Return every entry of directory *d* joined onto *d*, in ``os.listdir`` order."""
    full_paths = []
    for entry in os.listdir(d):
        full_paths.append(os.path.join(d, entry))
    return full_paths

def removePassage(my_str):
    """Replace the literal two/three-character sequences ``\\ud`` and ``\\n`` by a blank.

    Only escaped sequences (a backslash followed by ``ud`` or ``n``) are
    replaced; real newline characters are left untouched.
    """
    cleaned = my_str
    for pattern in ("\\\\ud", "\\\\n"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned

def extract_page_one(path):
    """Extract the text of the first page of a PDF with pdfminer.

    Args:
        path: Path of the PDF file.

    Returns:
        The ``StringIO`` the text converter wrote into; call ``.getvalue()``
        to obtain the text.

    Raises:
        IndexError: If the document yields no pages (same exception type the
            previous ``list(...)[0]`` lookup produced).
    """
    output_string = StringIO()

    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Take only the first page from the generator instead of building a
        # list of every page of the document.
        first_page = next(iter(PDFPage.create_pages(doc)), None)
        if first_page is None:
            raise IndexError("PDF document contains no pages")
        interpreter.process_page(first_page)
    return output_string
    


### "Chronological left join" for adding labelled new line tokens ("NEWLINE")
def add_newlines(Tokens,Real_Tokens,y_final):
    """Expand a label vector so that it also covers 'NEWLINE' tokens.

    Args:
        Tokens: Token sequence without 'NEWLINE' markers.
        Real_Tokens: Token sequence that additionally contains 'NEWLINE' entries.
        y_final: Labels indexed like ``Tokens`` (``y_final[i]`` belongs to ``Tokens[i]``).

    Returns:
        Tuple ``(RealTokens_final, y_final_REAL)``: ``Real_Tokens`` cut to the
        number of labels produced, and the list of labels. Skipped 'NEWLINE'
        entries get 'miscellaneous', except those relabelled 'I-title' by the
        title pass at the end.

    NOTE(review): ``Real_Tokens[j]`` and ``y_final_REAL[i+1]`` are not bounds
    checked, so an IndexError looks possible (e.g. a 'NEWLINE' as last
    labelled token) - confirm callers are prepared for that.
    """
    y_final_REAL = []
    # k: last position visited by the inner scan (0 until that scan has run).
    # m: position in Real_Tokens of the most recently labelled non-NEWLINE token.
    k = 0
    m = 0
    for i in range(len(Tokens)):
        # Choose the position j in Real_Tokens at which to look for Tokens[i].
        if k == 0:
            j=i
        else:
            if m == 0:
                j = k+1
            else:
                j = m+1
        if Tokens[i] == Real_Tokens[j] : # Tokens are the same: copy the label of Tokens[i]
            y_final_REAL.append(y_final[i])
            m = j
        else:
            for k in range(j,len(Real_Tokens)): # Otherwise walk through Real_Tokens up to the next non-NEWLINE entry

                if Real_Tokens[k] == 'NEWLINE':
                    y_final_REAL.append('miscellaneous')

                else:
                    # The first non-NEWLINE entry receives the label of Tokens[i];
                    # it is not compared with Tokens[i] itself.
                    y_final_REAL.append(y_final[i])
                    m=k
                    break

    RealTokens_final = Real_Tokens[:len(y_final_REAL)]
    # Positions of all 'I-title' labels; without any there is nothing to relabel.
    index_title = [i for i, e in enumerate(y_final_REAL) if e == 'I-title']
    if index_title==[]:
        return(RealTokens_final,y_final_REAL)
    else:
        end_title = max(index_title)

        ### Label NEWLINEs inside the title as "I-title"
        for i in range(len(RealTokens_final)):
            if RealTokens_final[i]=='NEWLINE':
                # Several NEWLINEs can follow each other, so "end_title>i" is used to
                # tell whether a title label still comes later in the vector.
                # NOTE(review): for i == 0 the index i-1 reads the last label.
                if (y_final_REAL[i+1] =='I-title' or end_title>i) and y_final_REAL[i-1] in ('B-title','I-title'):
                    y_final_REAL[i] = 'I-title'

        return(RealTokens_final,y_final_REAL)


### Grobid: remove [[...]]
def removeAutor_grobid(my_str):
    """Flatten the string form of a list of ``[forename, surname]`` lists.

    ``"[['J. A.', 'Smith'], ['B.', 'Jones']]"`` becomes
    ``"J. A., Smith, B., Jones"``.  A string without the ``[['`` / ``']]``
    wrapper (for example ``"[]"``) is returned with only its single quotes
    removed.

    Args:
        my_str: The stringified author list.

    Returns:
        The comma-separated name parts without brackets and quotes.
    """
    # Join neighbouring authors first; previously the inner "], [" survived,
    # leaving names such as "Smith]" and "[B." after splitting on commas.
    joined = re.sub(r"'\],\s*\['", "', '", my_str)
    without_open = re.sub(r"\[\['", "", joined)
    without_close = re.sub(r"'\]\]", "", without_open)
    return re.sub("'", "", without_close)

def removeAutor(my_str):
    """Strip the list wrapper and quotes from a stringified author list.

    Removes every ``['``, every ``']`` and finally every remaining single
    quote, e.g. ``"['Smith, John', 'Doe, Jane']"`` -> ``"Smith, John, Doe, Jane"``.

    Args:
        my_str: The stringified author list.

    Returns:
        The string without the removed characters.
    """
    # Raw strings: "\[" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    without_open = re.sub(r"\['", "", my_str)
    without_close = re.sub(r"'\]", "", without_open)
    return re.sub("'", "", without_close)


# Read TEI Files

In [None]:
### DO NOT RUN! Predictions of GROBID are already saved in 'grobid_16467.csv'
### Transform the tei.xml files into a dataframe


def _format_author(person):
    """Return ``[forename, surname]``; one-letter name parts get a trailing dot."""
    if len(person.firstname) == 1:
        # NOTE(review): with an empty middle name this yields "J. ." - kept
        # unchanged so the stored predictions stay reproducible.
        forename = person.firstname + ". " + person.middlename + "."
    elif len(person.middlename) == 1:
        forename = person.firstname + " " + person.middlename + "."
    else:
        forename = person.firstname + " " + person.middlename
    return [forename, person.surname]


all_files = listdir_fullpath("/_final_selection_16478/GROBID_ALL")

# Collect plain dicts and build the frame once: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
rows = []
for tei_doc in all_files:
    tei = TEIFile(tei_doc)
    # Derive the id from the file name only ("Core_ID_<id>.tei.xml"), so it no
    # longer depends on a hard-coded directory prefix.
    file_name = os.path.basename(tei_doc)
    core_id = re.sub(r"\.tei\.xml$", "", re.sub(r"^Core_ID_", "", file_name))
    # tei.authors is a property that walks the document on every access;
    # read it once per file.
    authors = [_format_author(person) for person in tei.authors]
    rows.append({"core_id": core_id, "title": tei.title, "authors": str(authors)})

Frame = pd.DataFrame(rows, columns=["core_id", "title", "authors"])

Frame.to_excel("grobid_16467.xlsx")
Frame.to_csv('grobid_16467.csv')
Frame

# Evaluate GROBID prediction

In [None]:
# Load the stored GROBID predictions and discard the 'Unnamed: 0' column
# (presumably the row index written by to_csv - TODO confirm).
df_grobid_pred = pd.read_csv('grobid_16467.csv', sep=',')
df_grobid_pred = df_grobid_pred.drop(columns=['Unnamed: 0'])
df_grobid_pred

In [None]:
# Select the PDFs in path_9620 for which a GROBID prediction exists.
path_9620 = "D:/PDF_Grobid_train_eval/"

pdf = os.listdir(path_9620)

# Build the id lookup once instead of converting the column to a list for
# every file in the directory.
known_core_ids = set(df_grobid_pred.core_id)

train_files_core_id = []
train_files_paths = []
for elem in pdf:
    try:
        core = int(elem.replace(".pdf","").replace("Core_ID_",""))
    except ValueError:
        # Not a "Core_ID_<number>.pdf" file (e.g. a stray system file): skip it.
        continue
    if core in known_core_ids:
        train_files_core_id.append(core)
        train_files_paths.append(path_9620 + "Core_ID_"+str(core)+".pdf")
print(len(train_files_core_id))
# Show a sample only when something was selected.
if train_files_core_id:
    print(train_files_core_id[0])
    print(train_files_paths[0])

In [None]:
# Keep only the predictions whose core_id is in train_files_core_id and
# renumber the rows from 0.
selected_rows = df_grobid_pred.core_id.isin(train_files_core_id)
df_grobid_pred = df_grobid_pred[selected_rows].reset_index(drop=True)
print(df_grobid_pred.shape)
missing_titles = sum(df_grobid_pred.title.isna())
empty_authors = sum(df_grobid_pred.authors == "[]")
print("{} titles and {} authors are NA".format(missing_titles, empty_authors))

In [None]:
### Extract the first-page text of every selected PDF

all_pdf_text = []
start_time = datetime.now()
total_files = len(train_files_paths)
for i, pdf_path in enumerate(train_files_paths):
    try:
        page_text = extract_page_one(pdf_path).getvalue()
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` so Ctrl-C still stops the
        # run.  The placeholder keeps all_pdf_text aligned with
        # train_files_paths.
        all_pdf_text.append("Not readable")
        print("ERROR", pdf_path, repr(exc))
    else:
        all_pdf_text.append(page_text)
        if i % 500 == 0:
            print(str((i/total_files)*100)+'%')

end_time = datetime.now()

In [None]:
### Import REAL Meta Data

df_meta_real = pd.read_csv('metadata_15553.csv', sep = ',')


def _first_meta_row(core_id):
    """Return the index label of the first metadata row with this coreId, or None."""
    matches = df_meta_real.index[df_meta_real['coreId'] == int(core_id)].tolist()
    return matches[0] if matches else None


### Get the real title and the real authors for every selected PDF.
### Both lists stay aligned with train_files_core_id.
titles_real = []
authors_real = []
for core_id in train_files_core_id:
    row = _first_meta_row(core_id)
    if row is None:
        titles_real.append('metadata not found')
        # The author lookup used to crash with IndexError here.  An empty list
        # keeps the lists aligned without inventing author names.
        authors_real.append([])
        continue

    titles_real.append(df_meta_real.loc[row, 'title'])

    raw_authors = df_meta_real.loc[row, 'authors']
    if not isinstance(raw_authors, str):
        # Missing author cell (e.g. NaN): treat it like missing metadata.
        authors_real.append([])
        continue
    # Collapse excessive whitespace, also at the start and end of each name part.
    authors_real.append([' '.join(part.split()) for part in removeAutor(raw_authors).split(",")])

len(titles_real)
len(authors_real)

# Get Label vectors for the GROBID Prediction

In [None]:
### Collect the GROBID title for every row of df_grobid_pred
### (a fixed placeholder is used where no title was predicted)
titles = []
title_is_missing = df_grobid_pred.title.isna()
for row_pos in range(len(df_grobid_pred.core_id)):
    wanted_id = int(df_grobid_pred.core_id[row_pos])
    matching_rows = df_grobid_pred.index[df_grobid_pred['core_id'] == wanted_id].tolist()
    if title_is_missing[row_pos]:
        titles.append('title was not predicted')
    else:
        # As before, the first row carrying this core_id supplies the title.
        titles.append(df_grobid_pred.loc[matching_rows[0], 'title'])

len(titles)

In [None]:
### Collect the GROBID author name parts for every row of df_grobid_pred

authors = []
for row_pos in range(len(df_grobid_pred.core_id)):
    wanted_id = int(df_grobid_pred.core_id[row_pos])
    # The first row carrying this core_id supplies the author string.
    first_match = df_grobid_pred.index[df_grobid_pred['core_id'] == wanted_id].tolist()[0]
    raw_parts = removeAutor_grobid(df_grobid_pred.loc[first_match, 'authors']).split(",")
    # Collapse excessive whitespace inside and around every name part.
    authors.append([' '.join(part.split()) for part in raw_parts])
len(authors)

In [None]:
### Data labelling: build one token-level label vector per paper from the
### GROBID prediction and one from the real metadata

no_author = []
no_title = []
error_papers = []
result_tokens = []
result_label = []
### Real Meta
result_label_real = []
###

count_papers = len(df_grobid_pred.core_id)


def _title_labels(title, page_text):
    """Label the whitespace tokens of *page_text* with the position of *title*.

    The title is searched case-insensitively and literally (``re.escape``) in
    the whitespace-normalised page text.

    Returns:
        ``(found, labels)``.  ``labels`` is a NumPy string array with
        'B-title' / 'I-title' on the matched span and 'miscellaneous'
        elsewhere; when nothing matches, every token is 'miscellaneous'.
    """
    text = ' '.join(page_text.split())
    all_misc = np.repeat('miscellaneous', len(text.split()))
    if not isinstance(title, str):
        # e.g. NaN from a metadata row without a title
        return False, all_misc

    # re.escape: titles may contain any regex metacharacter ("[", "+", "?",
    # "."), not only the "(", ")" and "*" that were escaped by hand before.
    pattern = re.escape(' '.join(removePassage(title).split()).lower())
    match = re.search(pattern, text.lower()) if pattern else None
    if match is None:
        return False, all_misc

    # Same slicing as before: one character on each side of the match is
    # dropped, i.e. the blank separating the title from its neighbours.
    before = "" if match.start() == 0 else text[0:match.start() - 1]
    inside = text[match.start():match.end()]
    after = text[match.end() + 1:len(text)]
    labels = np.concatenate((
        np.repeat('miscellaneous', len(before.split())),
        np.append(['B-title'], np.repeat('I-title', len(inside.split()) - 1)),
        np.repeat('miscellaneous', len(after.split())),
    ), axis=None)
    return True, labels


def _label_authors(author_fields, tokens, tokens_lower, labels, title_found):
    """Write 'author' into *labels* (in place) for tokens matching an author name.

    *author_fields* is the flat list of name parts of one paper.  Entries at
    even positions are always used as search words; when the second entry
    does not look like an initial ("X."), the whitespace-split entries at odd
    positions are used as well.

    Returns:
        The boolean mask (one entry per token) of tokens that matched a name.
    """
    names = [field.lower() for field in author_fields[0::2]]
    uses_initials = re.match(r'.\.', author_fields[1]) is not None
    if not uses_initials:
        other_parts = [part.lower() for field in author_fields[1::2] for part in field.split()]
        names = other_parts + names

    mask = [any(name in token for name in names) for token in tokens_lower]
    hits = [pos for pos, flag in enumerate(mask) if flag]

    if title_found and len(hits) > len(names):
        # More matches than names: drop the surplus matches that lie farthest
        # from the beginning of the title.
        surplus = len(hits) - len(names)
        title_start = np.where(labels == "B-title")[0][0]
        dist = [abs(pos - title_start) for pos in hits]
        hit_by_dist = dict(zip(dist, hits))
        dist.sort(reverse=True)
        for far in dist[0:surplus]:
            mask[hit_by_dist[far]] = False

    for pos in range(len(labels)):
        if mask[pos]:
            labels[pos] = 'author'

    if uses_initials:
        # Also label upper-case initials ("J.") next to a matched name.  The
        # window is clamped to valid positions; before, a negative start
        # wrapped around to the end of the page and an overrun at the end
        # discarded the whole paper.
        limit = min(len(labels), len(tokens))
        for pos in [p for p, flag in enumerate(mask) if flag]:
            for t in range(max(pos - 4, 0), min(pos + 4, limit)):
                if re.match(r'.\.', tokens_lower[t]) is not None and tokens[t].isupper():
                    labels[t] = 'author'
    return mask


for paper in range(count_papers):
    page_text = all_pdf_text[paper]

    title_found, y_final = _title_labels(titles[paper], page_text)
    #### Real Meta: a fresh vector for every paper, so labels of the previous
    #### paper are never reused when the real title is not found
    title_found_real, y_final_real = _title_labels(titles_real[paper], page_text)

    print('CoreID:  ' + str(train_files_core_id[paper]))

    if not title_found:
        no_title.append(df_grobid_pred.core_id[paper])

    ##### Get Text
    Real_Tokens = re.sub("\\n"," NEWLINE ",page_text).split()
    Tokens = page_text.split()
    Tokens_final_lower = [token.lower() for token in Tokens]

    # Any failure while labelling a paper excludes it from all three result
    # lists; its core id is recorded in error_papers instead.
    try:
        if authors[paper] != ['[]']:
            vec_author = _label_authors(authors[paper], Tokens, Tokens_final_lower, y_final, title_found)
            if True not in vec_author:
                no_author.append(train_files_core_id[paper])

        ### Real Meta
        _label_authors(authors_real[paper], Tokens, Tokens_final_lower, y_final_real, title_found_real)
        ###

        RealTokens_final, y_final_REAL = add_newlines(Tokens,Real_Tokens,y_final)
        ### Real Meta
        y_final_REAL2 = add_newlines(Tokens,Real_Tokens,y_final_real)[1]
        result_label_real.append(y_final_REAL2)
        ###
        result_label.append(y_final_REAL)
        result_tokens.append(RealTokens_final)
    except Exception:
        error_papers.append(train_files_core_id[paper])

In [None]:
# Number of papers without author match, without title match, and with errors
for collected in (no_author, no_title, error_papers):
    print(len(collected))

# Sizes of the three result lists
for collected in (result_label, result_tokens, result_label_real):
    print(len(collected))

# Evaluation of GROBID

In [None]:
# Per-document precision/recall of the GROBID labels against the labels
# derived from the real metadata, averaged over all documents.
Classes = ["B-title","I-title","author","miscellaneous"]
label_ids = {name: position for position, name in enumerate(Classes)}

all_precision = []
all_recall = []
for i in range(len(result_tokens)):

    y_true = [label_ids[t] for t in result_label_real[i]]
    y_pred = [label_ids[t] for t in result_label[i]]

    # Fix the label set so the matrix is always 4x4 with rows/columns in the
    # order of Classes.  Without it a class missing from a document shifted
    # the following classes to the wrong position.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2, 3])
    true_pos = cm.diagonal()

    false_pos = np.sum(cm, axis=0) - true_pos
    false_neg = np.sum(cm, axis=1) - true_pos

    # 0/0 (class neither predicted nor present) counts as 0, as before.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))

    all_precision.append(precision)
    all_recall.append(recall)

pm = np.array(all_precision)
rm = np.array(all_recall)

precision = pm.sum(axis=0)/len(pm)
recall = rm.sum(axis=0)/len(rm)
# NOTE(review): the name f1_score shadows sklearn.metrics.f1_score imported at
# the top of the file; it is kept because later cells may rely on it.
with np.errstate(divide='ignore', invalid='ignore'):
    # A class with precision == recall == 0 gets F1 0 instead of NaN.
    f1_score = np.nan_to_num(2*(precision*recall)/(precision+recall))
for c,p,r,f in zip(Classes,precision,recall,f1_score):
    print("Scores for label {}: \nPrecision ==> {} \nRecall ==> {} \nF1-Score ==> {}\n".format(c,p,r,f))

# Averages over the first three classes only, i.e. without "miscellaneous"
overall_precison = np.mean(precision[:3])
overall_recall = np.mean(recall[:3])
overall_f1_score = np.mean(f1_score[:3])

print("Scores over all classes (except \"miscellaneous\"):\nPrecision ==> {} \nRecall ==> {} \nF1-Score ==> {}\n".format(overall_precison,overall_recall,overall_f1_score))