# Meta Data Matching
## Import packages

In [None]:
# read csv
import pandas as pd
import os
# PdfMiner
import glob
import numpy as np
from io import StringIO
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

########################################
from random import random
from numpy import array
from numpy import cumsum
########################################
import regex as re  
import string
import re
########################################
from datetime import datetime
import collections
########################################


## Functions

In [None]:
### Function that replaces escaped line-break sequences (literal "\ud" and "\n") with spaces
def removePassage(my_str):
    r"""Replace escaped line-break sequences in *my_str* with single spaces.

    The sequences are the literal texts ``\ud`` and ``\n`` (a backslash
    followed by letters), not real control characters. ``\ud`` is handled
    first, then ``\n``.

    Returns the cleaned string; whitespace is not collapsed here.
    """
    cleaned = my_str
    for escaped_break in ("\\\\ud", "\\\\n"):
        cleaned = re.sub(escaped_break, " ", cleaned)
    return cleaned

### Function that parses the first page of a PDF
def extract_page_one(path):
    """Extract the text of the first page of the PDF at *path*.

    Returns the ``StringIO`` buffer the text was written to; callers read
    it with ``.getvalue()``.

    Raises ``IndexError`` if the document has no pages. Errors raised by
    ``open`` or by pdfminer while parsing propagate unchanged.
    """
    output_string = StringIO()

    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Only page one is needed: pull a single page from the iterator
        # instead of building a list with every page of the document.
        first_page = next(iter(PDFPage.create_pages(doc)), None)
        if first_page is None:
            raise IndexError('PDF contains no pages: ' + str(path))
        interpreter.process_page(first_page)

    return output_string
    
### Function that removes the list brackets and quotes ("['...']") from the author fields
def removeAutor(my_str):
    """Strip the list syntax from a stringified author list.

    Removes every ``['`` and ``']`` and then every remaining single quote,
    so ``"['Smith, J.', 'Doe, A.']"`` becomes ``"Smith, J., Doe, A."``.
    Commas and whitespace are left for the caller to split on.

    Plain ``str.replace`` is used because all three patterns are literal
    text; the earlier regex literals contained invalid escape sequences.
    """
    return my_str.replace("['", "").replace("']", "").replace("'", "")

### "Chronological left join" for adding labelled new line tokens ("NEWLINE")
def add_newlines(Tokens,Real_Tokens,y_final):
    """Project token labels onto a token stream that also holds 'NEWLINE' markers.

    Parameters:
        Tokens: tokens of the text without line-break markers.
        Real_Tokens: the same text tokenised with a 'NEWLINE' token at
            every line break.
        y_final: one label per entry of ``Tokens``.

    Returns a tuple ``(RealTokens_final, y_final_REAL)``: the prefix of
    ``Real_Tokens`` that received a label, and the labels for that prefix.
    A skipped 'NEWLINE' is labelled 'miscellaneous', unless it sits inside
    the title, in which case it is labelled 'I-title'.

    Raises ``IndexError`` if ``Real_Tokens`` is exhausted before all of
    ``Tokens`` have been aligned.
    """
    y_final_REAL = []
    k = 0  # last index visited by the catch-up scan below (0 until one has run)
    m = 0  # index in Real_Tokens of the most recently aligned token
    for i in range(len(Tokens)):
        # Choose the position in Real_Tokens to compare Tokens[i] against.
        if k == 0:
            # No scan has advanced yet, so both streams share positions.
            j=i
        else:
            if m == 0:
                j = k+1
            else:
                j = m+1
        if Tokens[i] == Real_Tokens[j] : # If tokens are the same, then take y_final_REAL
            y_final_REAL.append(y_final[i])
            m = j
        else:
            for k in range(j,len(Real_Tokens)): # Else go through Real_Tokens until there is a match
                if Real_Tokens[k] == 'NEWLINE':
                    y_final_REAL.append('miscellaneous')
                else:
                    # The first non-NEWLINE token is taken as the match for Tokens[i].
                    y_final_REAL.append(y_final[i])
                    m=k
                    break

    RealTokens_final = Real_Tokens[:len(y_final_REAL)]

    # Position of the last 'I-title' label. -1 when the title is a single
    # token (only 'B-title') or absent, so no NEWLINE counts as "before
    # the end of the title" in that case.
    index_title = [i for i, e in enumerate(y_final_REAL) if e == 'I-title']
    end_title = max(index_title, default=-1)
    last_position = len(y_final_REAL) - 1

    ### label NEWLINES in title as "I-title"
    ## Several NEWLINES can follow each other. Therefore "end_title > i" is
    ## used to check whether a title label still follows later in the vector.
    for i in range(len(RealTokens_final)):
        if RealTokens_final[i] != 'NEWLINE':
            continue
        # Guard both neighbours: the first position has no predecessor and
        # the last position has no successor.
        previous_in_title = i > 0 and y_final_REAL[i-1] in ('B-title','I-title')
        next_is_title = i < last_position and y_final_REAL[i+1] == 'I-title'
        if previous_in_title and (next_is_title or end_title > i):
            y_final_REAL[i] = 'I-title'
    return(RealTokens_final,y_final_REAL)

#Create Word Vector representation
def detect_and_vectorize(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. result_tokens[2]
    """Turn the tokens of one paper into a fixed-length array of word vectors.

    The language of the joined tokens is determined with ``detect``; 'ru'
    and 'bg' select ``ft_ru`` / ``ft_bg``, every other result falls back
    to ``ft_uk``. Each token is embedded with ``get_word_vector`` and cast
    to ``np.float16``. The sequence is then padded with ``np.zeros(60)``
    rows up to 1000 entries, or cut to the first 1000.

    NOTE(review): ``detect`` and the ``ft_*`` models are not defined in
    this part of the file; the padding assumes 60-dimensional word
    vectors - confirm against the loaded models.
    """
    vectors = []
    lang = detect(' '.join(tokens_sequence))

    if lang == 'ru':
        vectors.extend(np.float16(ft_ru.get_word_vector(word)) for word in tokens_sequence)
    elif lang == 'bg':
        vectors.extend(np.float16(ft_bg.get_word_vector(word)) for word in tokens_sequence)
    else:  ## assume language == uk
        vectors.extend(np.float16(ft_uk.get_word_vector(word)) for word in tokens_sequence)

    # Pad short papers with zero vectors ...
    missing = 1000 - len(vectors)
    if missing > 0:
        vectors.extend(np.zeros(60) for _ in range(missing))

    # ... and cut long papers down to the first 1000 tokens.
    vectors = vectors[:1000]

    return np.array(vectors)

### Additional Features
# Characters that mark a token as "contains punctuation". Raw string so the
# backslash is an ordinary member of the set, not an escape.
punctuations = r'''!()[]{};:'"\<>/?@#$%^&*«»_–~.,-'''


def _token_features(token):
    """Return the five 0/1 features of a single token as a tuple.

    Order: all upper case, first character upper case, author-initial
    format (upper case and the second character is a dot), contains a
    punctuation character, is the 'NEWLINE' marker.
    """
    if token == 'NEWLINE':
        # Line-break markers carry only the newline flag.
        return (0, 0, 0, 0, 1)

    text = str(token)
    is_upper = text.isupper()
    return (
        int(is_upper),
        # Slicing instead of indexing keeps an empty token from raising.
        int(text[:1].isupper()),
        int(is_upper and re.match(r'.\.', text) is not None),
        int(any(char in punctuations for char in text)),
        0,
    )


def compute_additional_features(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. result_tokens[2]
    """Compute hand-crafted per-token features for one paper.

    The token sequence is padded with the string '0' up to 1000 entries,
    or cut to the first 1000. The padding and truncation happen on a copy,
    so the caller's list is left unchanged.

    Returns an integer array of shape (1000, 5); see ``_token_features``
    for the meaning of the columns.
    """
    tokens = list(tokens_sequence[:1000])
    tokens.extend(str(0) for _ in range(1000 - len(tokens)))

    feature_rows = [_token_features(token) for token in tokens]
    return np.array(feature_rows, dtype=np.int64)

## Read Meta Data

In [None]:
### Import Meta Data
# Load the metadata CSV into a DataFrame. Later cells read its 'coreId',
# 'title' and 'authors' columns.
path_meta="D:/final_items_15553.csv"
df_meta = pd.read_csv(path_meta, sep = ',')#  , encoding= 'utf-16')
# df_meta.drop('Unnamed: 0' , axis = 1 , inplace=True)
# Bare expression: shows the table as cell output when run in a notebook.
df_meta

## Read PDF

In [None]:
# Root directory that is walked recursively (os.walk) to collect the PDF files.
path= "D:/PDFs_15553"

In [None]:
# Collect the name and the full path of every PDF below `path`.
# `files` and `file_paths` are filled in lockstep, so entry i of both
# lists refers to the same file.
files = []
file_paths = []
print('Reading directory: '+str(path))
for root, _subdirs, names in os.walk(path):
    for name in names:
        # Suffix check: a substring test would also accept names such as
        # "x.pdf.bak", which cannot be turned into a core ID afterwards.
        if name.endswith('.pdf'):
            files.append(name)
            file_paths.append(os.path.join(root, name))

In [None]:
### Get Core IDs of Files in directory
# File names are expected to look like "Core_ID_<number>.pdf". The prefix
# and the extension are stripped and the remainder is parsed as an int;
# any other name raises ValueError. The patterns are anchored and the dot
# is escaped, so only a real "Core_ID_" prefix and ".pdf" suffix are removed.
files_core_id = [
    int(re.sub(r'\.pdf$', '', re.sub(r'^Core_ID_', '', str(name))))
    for name in files
]

len(files_core_id)

In [None]:
### Extract the first-page text of all PDFs (15,553 files)
## Prints the progress in percent every 100 files

all_pdf_text = []
start_time = datetime.now()
for position, pdf_path in enumerate(file_paths):
    # One entry per file, in the same order as `files` / `file_paths`.
    all_pdf_text.append(extract_page_one(pdf_path).getvalue())
    if position % 100 == 0:
        print(str((position/len(files))*100)+'%')

end_time = datetime.now()

In [None]:
# Report how long the text extraction took, in minutes.
time_delta = end_time - start_time
seconds_all = time_delta.total_seconds()
minutes_all = seconds_all / 60
print(f'Time duration for text extractions: {minutes_all} min.')

In [None]:
### Look up the metadata title for the core ID of every PDF
# `titles` is aligned with `files_core_id`. A PDF without a metadata row
# gets the placeholder 'No metadata found'; when several rows share a
# core ID, the first one is used.
titles = []
for core_id in files_core_id:
    matching_rows = df_meta.index[df_meta['coreId'] == int(core_id)].tolist()
    if matching_rows:
        titles.append(df_meta.loc[matching_rows[0], 'title'])
    else:
        titles.append('No metadata found')
len(titles)

In [None]:
### Get the authors for the PDFs
# `authors` is aligned with `files_core_id`. Each entry is the list of
# comma-separated name parts from the metadata 'authors' field, with
# surplus whitespace removed. A PDF without a metadata row, or with a
# non-string 'authors' value (e.g. NaN for an empty field), gets an empty
# list, so the lookup no longer aborts on those files.

authors = []
for core_id in files_core_id:
    matching_rows = df_meta.index[df_meta['coreId'] == int(core_id)].tolist()
    raw_authors = df_meta.loc[matching_rows[0], 'authors'] if matching_rows else None

    if not isinstance(raw_authors, str):
        authors.append([])
        continue

    # ' '.join(part.split()) collapses runs of whitespace and trims the ends.
    authors.append([' '.join(part.split()) for part in removeAutor(raw_authors).split(",")])
len(authors)

## Matching

In [None]:
### Label the first-page tokens of every paper
#
# For each paper the metadata title is located on page one; its tokens are
# labelled 'B-title' / 'I-title', tokens matching an author name are labelled
# 'author', everything else 'miscellaneous'. Finally 'NEWLINE' tokens are
# merged in via add_newlines().
#
# NOTE(review): author matching is a substring test, so a short name can
# also hit (and overwrite) title tokens - confirm this is intended.

no_author = []      # core IDs for which no token matched an author name
no_title = []       # core IDs whose title is missing or not found on page one
error_papers = []   # core IDs whose labelling raised an exception
result_tokens = []  # one token list (with 'NEWLINE') per labelled paper
result_label = []   # one label list per labelled paper, aligned with result_tokens
count_papers = len(files_core_id)

for paper in range(count_papers):
    raw_title = titles[paper]
    if not isinstance(raw_title, str):
        # E.g. NaN for an empty metadata field: there is nothing to search for.
        no_title.append(files_core_id[paper])
        continue

    # Normalise whitespace and case on both sides, then search the title as
    # literal text. Treating it as a regex breaks on titles that contain
    # characters such as '+', '?' or '['.
    title = ' '.join(removePassage(raw_title).split()).lower()
    Text_pdf_0 = ' '.join(all_pdf_text[paper].split())
    title_start = Text_pdf_0.lower().find(title)
    if title_start == -1:
        no_title.append(files_core_id[paper])
        continue
    title_end = title_start + len(title)

    try:
        # Split the page into text before / inside / after the title; the
        # -1 / +1 skip the blank that separates the title from its neighbours.
        teil_B = "" if title_start == 0 else Text_pdf_0[0:title_start - 1]
        teil_T = Text_pdf_0[title_start:title_end]
        teil_E = Text_pdf_0[title_end + 1:]

        y_teil1 = np.repeat('miscellaneous', len(teil_B.split()))
        y_teil2 = np.append(['B-title'], np.repeat('I-title', len(teil_T.split()) - 1))
        y_teil3 = np.repeat('miscellaneous', len(teil_E.split()))
        y_final = np.concatenate((y_teil1, y_teil2, y_teil3), axis=None)

        ##### Tokens without and with line-break markers
        Tokens = Text_pdf_0.split()
        Tokens_final_lower = [token.lower() for token in Tokens]
        Real_Tokens = re.sub("\\n", " NEWLINE ", all_pdf_text[paper]).split()

        ##### Author names to search for
        # Even positions of the author list are used as surnames, odd ones
        # as forenames. presumably the metadata is "Surname, Forename" -
        # verify. A second entry shaped like "X." means the forenames are
        # given as initials.
        paper_authors = authors[paper]
        has_initials = re.match(r'.\.', paper_authors[1]) is not None

        author_names = [name.lower() for name in paper_authors[0::2]]
        if not has_initials:
            forenames = [part.lower() for name in paper_authors[1::2] for part in name.split()]
            author_names = forenames + author_names
        # An empty name would be a substring of every token.
        author_names = [name for name in author_names if name]

        vec_author = [any(name in token for name in author_names) for token in Tokens_final_lower]
        index_author = [i for i, hit in enumerate(vec_author) if hit]

        # More hits than names: drop the surplus hits that lie farthest from
        # the start of the title. The distance is measured on this paper's
        # own labels; ties are broken by position so every dropped hit is
        # a distinct one.
        if len(index_author) > len(author_names):
            diff = len(index_author) - len(author_names)
            title_position = list(y_final).index('B-title')
            farthest_first = sorted(
                index_author,
                key=lambda position: (abs(position - title_position), position),
                reverse=True,
            )
            for position in farthest_first[:diff]:
                vec_author[position] = False

        for i in range(len(y_final)):
            if vec_author[i]:
                y_final[i] = 'author'

        if not any(vec_author):
            no_author.append(files_core_id[paper])

        # Initials format: also label upper-case "X." tokens in a window
        # around each author hit. The window is clamped to the token range
        # so it neither wraps to the end of the page nor runs past it.
        if has_initials:
            upper_bound = min(len(Tokens), len(y_final))
            for index in [i for i, hit in enumerate(vec_author) if hit]:
                for t in range(max(index - 4, 0), min(index + 4, upper_bound)):
                    if re.match(r'.\.', Tokens_final_lower[t]) is not None and Tokens[t].isupper():
                        y_final[t] = 'author'

        RealTokens_final, y_final_REAL = add_newlines(Tokens, Real_Tokens, y_final)

        result_label.append(y_final_REAL)
        result_tokens.append(RealTokens_final)
    except Exception:
        # Best effort: record the paper and carry on with the next one.
        error_papers.append(files_core_id[paper])
            

## Labeled Text

In [None]:
# Output directory for the labelled files. Raw string so the backslash is
# taken literally instead of starting an (invalid) escape sequence.
path_output = r"D:\human_readable_xml"

In [None]:
# Write one file per labelled paper, wrapping every non-miscellaneous token
# in <label>...</label> tags.
#
# result_tokens / result_label only hold the papers that were labelled
# successfully, so they must not be indexed by position in files_core_id.
# The labelled core IDs are rebuilt by leaving out the papers recorded in
# no_title and error_papers.
skipped_ids = set(no_title) | set(error_papers)
labelled_ids = [core_id for core_id in files_core_id if core_id not in skipped_ids]
if not (len(labelled_ids) == len(result_tokens) == len(result_label)):
    # Happens e.g. when the same core ID occurs for several files; writing
    # would then attach labels to the wrong paper.
    raise ValueError('Labelled results do not line up with the core IDs: '
                     + str(len(labelled_ids)) + ' IDs vs. '
                     + str(len(result_tokens)) + ' results')

start_time = datetime.now()
for run, (core_id, tokens, labels) in enumerate(zip(labelled_ids, result_tokens, result_label)):

    output_text = ' '.join(
        token if label == "miscellaneous" else f"<{label}>" + token + f"</{label}>"
        for token, label in zip(tokens, labels)
    )

    # The context manager closes the file even if the write fails.
    with open(path_output + "\\"+str(core_id)+".xml", "w",encoding="utf-8") as myfile:
        myfile.write(output_text)

    if run%100==0:
        print(str((run/len(labelled_ids))*100)+'%')

end_time = datetime.now()