In [1]:
# Roher Parser (vgl. auch die ggf. ausführlicher annotierte Datei Parser.ipynb)

import torch
import pandas as pd
torch.cuda.is_available()
from transformers import pipeline
import re
import string
import spacy
import requests

def custom_strip(text, replaceCharacter = []):
    allowed_chars = string.punctuation + string.whitespace + "“" + "”"
    for character in replaceCharacter:
        allowed_chars = allowed_chars.replace(character, '')
    return text.strip(allowed_chars)


def getIndexOfSubstring(text, regEx = [], reverse = False):
    #if reverse = False then it finds the first occurance of a given regEx.
    #if reverse = True, then it finds the last occurance of a given regEx.
    #beceause the occurance with the max length is taken, it always chooses the regex that covers the most letters
    length = 0
    matches = []
    substring = ""
    #print(f'regEx: {regEx}')
    #print(f'text: {text}')
    for regExElement in regEx:
        matches = list(re.finditer(regExElement, text))
        #print(f'matches: {matches}')
        if matches:
            if reverse:
                match = matches[-1]
            else:
                match = matches[0]
            buffer = match.end() - match.start()
            if buffer > length:
                length = buffer
                startIndex = match.start()
                endIndex = match.end()
                substring = text[match.start():match.end()]
    if substring != "":
        return startIndex, endIndex, substring   
    return -1, -1, substring

def replaceSubstring (startIndex, endIndex, text, substituteString, ignorePunctuation = ["&", "(", ")"]):
    #The regex also checks for punctuation so that it is particularly precise. 
    #The cut text however should in normale mode be without the front delimiter of the bibTex fields in the bibiography, 
    #so that future regex are not affected. But the last delimiter belongs to the cut word so this should be removed
    if endIndex > 0:
        startIndexReplace = 0
        endIndexReplace = 0
        if startIndex > 0:
            for i in range(startIndex, -1, -1):
                if isSpeceficPunctuation(text[i], ignorePunctuation):
                    startIndexReplace = i + 1
                    break
        else:
            startIndexReplace = 0            
        #print(f' replaceSubstring, startIndexReplace={{{startIndexReplace}}}')
        if endIndex < len(text):
            for i in range(endIndex-1, len(text), 1):
                if isSpeceficPunctuation(text[i], ignorePunctuation):
                    endIndexReplace = i + 1
                    break
                elif i == len(text)-1:
                    endIndexReplace = len(text)
        else:
            endIndexReplace = len(text)
        #print(f' replaceSubstring, endIndexReplace={{{endIndexReplace}}}')
        if endIndexReplace > 0:
            changedText = text[0:startIndexReplace] + substituteString + text[endIndexReplace:len(text)]
            return changedText, text[startIndexReplace:endIndexReplace]
    return text, ""

def is_SurenameFirst(names):
    splitedNames = names.split(" ")
    #print(f'is_SurenameFirst: {splitedNames}')
    #regex wie w+ erkennt bspw. KEIN è 
    if splitedNames[0].endswith("."):
        return True
    splitedNames = names.split(",")
    if all(" " in item.strip() for item in splitedNames):
        return True
    return False
    
def is_NameShortened(df_PER):
    for index in df_PER.index.values.tolist():
        if "." == text[df_PER["end"].iloc[index]] and len(text[df_PER["start"].iloc[index]:df_PER["end"].iloc[index] + 1]) == 2:
            return True
    return False


def isSpeceficPunctuation(text, replaceCharacter = []):
    allowed_chars = string.punctuation + string.whitespace
    for character in replaceCharacter:
        allowed_chars = allowed_chars.replace(character, '')
    return all(char in allowed_chars for char in text)

def is_Editor(editorRegEx, textBetweenNames, startIndexTextBetweenNames, markerBehind = True):
    startSubstring, endSubstring, substring = getIndexOfSubstring(textBetweenNames, [editorRegEx])
    if startIndexTextBetweenNames > -1 and markerBehind:
        if isSpeceficPunctuation(textBetweenNames[startIndexTextBetweenNames:startSubstring], ["&"]):
            return True, startSubstring + startIndexTextBetweenNames, endSubstring + startIndexTextBetweenNames
    elif startIndexTextBetweenNames > -1:
        if isSpeceficPunctuation(textBetweenNames[endSubstring:startIndexTextBetweenNames], ["&"]):
            return True, startSubstring + startIndexTextBetweenNames, endSubstring + startIndexTextBetweenNames
    return False, -1, -1

def processNames(authors):
    finalAuthors = ""
    search_terms = [" and ", ", and ", " & ", ", & "]
    surenameFirst = is_SurenameFirst(authors.strip())
    authors = custom_strip(authors)
    #print(f"processNames: {authors}")
    if surenameFirst:
        startIndex, endIndex, andInAuthors = getIndexOfSubstring(authors, search_terms)
        #print("Fall surenameFirst".format(authors))
        #hier völlig egal, ob er einzelne Initialen in ein eigenes Word gesteckt hat, obwohl es noch Nachnamen gib
        if startIndex >= 0:
            authors = authors.replace(andInAuthors, " and ")
            #print(f'authors: {authors}')
            finalAuthors = authors.replace(", ", " and ")
        else:
            finalAuthors = authors
    elif "., " in authors:
        #print("Fall ., {0}".format(authors))
        search_terms = ["., and ", "., & ", ". and ", ". & "]
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        authors = authors.replace("., ", "#., ")
        if andInAuthors != "":
            authors = authors.replace(andInAuthors, "#., ")
        authors = authors.split("., ")
        authors = [name.replace("#",".") for name in authors]
        authors = [name.replace("..",".") for name in authors]
        for author in authors[:-1]:
            buffer = author.split(", ")
            finalAuthors = finalAuthors + buffer[1] + " " + buffer[0] + " and "
        buffer = authors[-1].split(", ")
        finalAuthors = finalAuthors + buffer[1] + " " +  buffer[0]
    elif ", " in authors:
        #print("Fall , {0}".format(authors))
        search_terms = [", and ", ", & ", " and ", " & "]
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        if andInAuthors != "":  
            authors = authors.replace(andInAuthors, ", ")
        authors = authors.split(", ")
        for i in range(0, len(authors) - 3, 2):
            finalAuthors = finalAuthors + authors[i+1] + " " + authors[i] + " and "
        finalAuthors = finalAuthors + authors[len(authors) - 1] + " " + authors[len(authors) - 2]
    return custom_strip(finalAuthors)

def getAuthors(text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    
    onlyPunctuation = False
    onlyAnd = False
    authorsDetected = False
    setChainStart = True
    startIndexAuthors = -1
    endIndexAuthors = -1
    chainStartIndex = -1
    changedText = ""
    
    df_PER = getPersonTags(text)
    index_df_PER_List = df_PER.index.values.tolist()
    
    if not df_PER.empty and df_PER["start"].iloc[0] == 0:
        for index in index_df_PER_List:
            #beachte: Hiermit lese ich immer schon vor!
            if index < len(index_df_PER_List) - 1:
                textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
            else:
                textBetweenNames = text[df_PER["end"].iloc[index]:]
            onlyPunctuation = isSpeceficPunctuation(textBetweenNames, ["&"])
            firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
            onlyAnd = textBetweenNames == andTyp
            if setChainStart: 
                chainStartIndex = df_PER["start"].iloc[index]
                setChainStart = False
            if not onlyPunctuation and not onlyAnd:
                setChainStart = True
                startIndexAuthors = chainStartIndex
                endIndexAuthors = df_PER["end"].iloc[index]
                break
        if startIndexAuthors > -1:
            changedText, author = replaceSubstring(startIndexAuthors, endIndexAuthors, text, ".")
            author = processNames(author)
            return changedText, author
    return text, ""

def getEditors(text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    editorRegEx = "\s*(\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?\s*"
    onlyPunctuation = False
    onlyAnd = False
    setChainStart = True
    isEditor = False
    startIndexEditors = -1
    endIndexEditors = -1
    chainStartIndex = -1
    startIndexEditorMarker = -1
    endIndexEditorMarker = -1
    
    df_PER = getPersonTags(text)
    index_df_PER_List = df_PER.index.values.tolist()
    if not df_PER.empty:
        for index in index_df_PER_List:
            if index < len(index_df_PER_List) - 1:
                textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
            else:
                textBetweenNames = text[df_PER["end"].iloc[index]:]
            onlyPunctuation = isSpeceficPunctuation(textBetweenNames, ["&"])
            firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
            onlyAnd = textBetweenNames == andTyp
            #if true, that a new chain of Authors begins. An Author Chain is for example "Name1, Name2 and Name3"
            if setChainStart: 
                chainStartIndex = df_PER["start"].iloc[index]
                setChainStart = False
            #If the following if-STatement is true, than the chain has reached an end
            if not onlyPunctuation and not onlyAnd:
                setChainStart = True
                #editors can be the first Part of an literature reference
                textFromStartUntilFirstName = text[0:df_PER["start"].iloc[0]]
                isEditor, startIndexEditorMarker, endIndexEditorMarker = is_Editor(editorRegEx, textFromStartUntilFirstName, 0, False)
                #print(f' getEditors, startIndexEditorMarker : {startIndexEditorMarker}')
                if startIndexEditorMarker == -1:
                    isEditor, startIndexEditorMarker, endIndexEditorMarker = is_Editor(editorRegEx, textBetweenNames, df_PER["end"].iloc[index])                         
                    #print(f' getEditors, startIndexEditorMarker : {startIndexEditorMarker}')
                if isEditor:
                    startIndexEditors = chainStartIndex
                    endIndexEditors = df_PER["end"].iloc[index]
                    break
    #print(f'getAuthorsAndEditors: return: {[startIndexAuthors,endIndexAuthors],[startIndexEditors, endIndexEditors]}')
    if startIndexEditors > -1:
        changedText, editor = replaceSubstring(startIndexEditors, endIndexEditors, text, ".")
        editor = processNames(editor)
        if startIndexEditorMarker > -1:
            if startIndexEditorMarker > startIndexEditors:
                startIndexEditorMarker = startIndexEditorMarker - (len(text) - len(changedText))
                endIndexEditorMarker = endIndexEditorMarker - (len(text) - len(changedText))
            changedText, buffer = replaceSubstring(startIndexEditorMarker, endIndexEditorMarker, changedText, ".")
        startIndexIn = 0 
        if startIndexEditorMarker < startIndexEditors:
            startIndexEditors = startIndexEditors - (endIndexEditorMarker - startIndexEditorMarker)
            endIndexEditors = endIndexEditors - (endIndexEditorMarker - startIndexEditorMarker)
        for i in range(startIndexEditors-1, -1, -1):
            if isSpeceficPunctuation(changedText[i], [":", " "]):
                startIndexIn = i + 1
                break
        #print(f' getEditors, startIndexIn : {startIndexIn}')
        #print(f' getEditors, startIndexEditors : {startIndexEditors}')
        changedText, replacedEditorMarker = replaceSubstring(startIndexIn, startIndexEditors, changedText, ".")
        return changedText, editor
    return text, ""
    
def getPersonTags(text):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty: 
        return df_outputs[df_outputs["entity_group"] == "PER"].reset_index(drop=True)
    return pd.DataFrame()

def getORGTag(text, score):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty:
        return df_outputs[(df_outputs["entity_group"] == "ORG") & (df_outputs["score"] >= score)].reset_index(drop=True).tail(1)
    return pd.DataFrame()

def getLOCTag(text):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty:
        return df_outputs[(df_outputs["entity_group"] == "LOC")].reset_index(drop=True)
    return pd.DataFrame()

def getDoi(text):
    doiUrlRegEx1 = "https:\/\/doi\.org(\/[^\s]*)?$"
    doiUrlRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org)?([^\s]*)+$"
    changedText, doi  = getSubstringByRegEx(text, [doiUrlRegEx1, doiUrlRegEx2])
    httpsDomainRegEx1 = "https:\/\/doi\.org\/"
    httpsDomainRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org\/)?"
    doi, httpsDomain = getSubstringByRegEx(doi, [httpsDomainRegEx1, httpsDomainRegEx2])
    return changedText, custom_strip(doi)

def getURL(text):
    urlRegEx = "(URL:|url:)?\s*https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?"
    changedText, url = getSubstringByRegEx(text, [urlRegEx])
    urlPrefixRegEx = r"(url:\s*|URL:\s*)"
    url = re.sub(urlPrefixRegEx, '', url).strip()
    return changedText, custom_strip(url)

def getDate(text):
    monthYearRegex = "(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?" \
    "|May|May\.?|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept\.?|October|" \
    "Oct\.?|November|Nov\.?|December|Dec\.?)\s\d{4}"
    changedText, monthYear  = getSubstringByRegEx(text, [monthYearRegex])
    #print(f' getDate, text={{{text}}}')
    if monthYear == "":
        yearRegEx1 = "(\.|,)? \(\d{4}\)(\.|,|:)"
        yearRegEx2 = "(\.|,) \d{4}(\.|,|;)"
        changedText, year  = getSubstringByRegEx(text, [yearRegEx1, yearRegEx2])
        #print(f' getDate, changedText={{{changedText}}}')
        return changedText, "", f'{year}'
    monthYear = monthYear.split(' ')
    return changedText, f'{monthYear[0]}', f'{monthYear[1]}'

def getPage(text):
    pageRegEx = "(?:pp\.? )?\d+(-|–)\d+"
    changedText, pages = getSubstringByRegEx(text, [pageRegEx])
    if pages != "":
        pages = re.search(r'\d+(-|–)\d+', pages).group()
    return changedText, custom_strip(pages)

def getSubstringByRegEx(text, regex = []):
    startIndex, endIndex, substring = getIndexOfSubstring(text, regex, True)
    #print(f' getSubstringByRegEx, startIndex={{{startIndex}}}')
    #print(f' getSubstringByRegEx, startIndex={{{endIndex}}}')
    #print(f' getSubstringByRegEx, text={{{substring}}}')
    changedText, substring = replaceSubstring(startIndex, endIndex, text, "")
    #print(f' getSubstringByRegEx, changedText={{{changedText}}}')
    return changedText, custom_strip(substring)

def getVolumeNumber(text):
    volumeAndNumberRegex = "(\d+\(\d+\)|\d+\.\d+)"
    startIndex, endIndex, substring = getIndexOfSubstring(text, [volumeAndNumberRegex], True)
    if startIndex > -1:
        changedText, volumeNumber = replaceSubstring(startIndex, endIndex, text, "")
        startIndex, endIndex, volume = getIndexOfSubstring(volumeNumber, ["\d+"])
        startIndex, endIndex, number = getIndexOfSubstring(volumeNumber, ["\d+"], True)
        return changedText, volume, number
    else:
        volumeRegEx = "(V|v)ol\. \d+"
        volumeRegEx2 = ", \d+,"
        volumeRegEx3 = ",? \d+(:|\.)"
        number1RegEx = "no\. \d+"
        number2RegEx = "Issue \d+"
        number3RegEx = ", \d+,"
        number4RegEx = "\.\d+"
        startIndex, endIndex, volume = getIndexOfSubstring(text, [volumeRegEx, volumeRegEx2, volumeRegEx3], True)
        changedText, substring = replaceSubstring(startIndex, endIndex, text, "")
        #print(f' getVolumeNumber, changedText={{{changedText}}}')
        startIndex, endIndex, number = getIndexOfSubstring(changedText, [number1RegEx, number2RegEx, number3RegEx, number4RegEx], True)
        changedText, substring = replaceSubstring(startIndex, endIndex, changedText, "")
        if volume != "":
            volume = re.search(r'\d+', volume).group(0)
        if number != "":
            number = re.search(r'\d+', number).group(0)
        return changedText, volume, number

def getEdition(text):
    editionRegEx1 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) ed\."
    editionRegEx2 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) edn\."
    changedText, edition = getSubstringByRegEx(text, [editionRegEx1, editionRegEx2])
    if edition != "":
        edition = re.search(r'\d+', edition).group()
    return changedText, custom_strip(edition)

def getAddress(text):
    df_LOC = getLOCTag(text)
    #print(f' df_LOC, df_LOC={{{df_LOC}}}')
    addressFound = False
    index_df_Loc_List = df_LOC.index.values.tolist()
    textBetweenAddress = ""
    setChainStart = True
    startIndex = 0
    endIndex = 0
    if not df_LOC.empty:
        for index in reversed(index_df_Loc_List):
            if index < len(index_df_Loc_List) and index > 0:
                textBetweenAddress = text[df_LOC["end"].iloc[index-1]:df_LOC["start"].iloc[index]]
            else:
                textBetweenAddress = text[:df_LOC["start"].iloc[index]]
            onlyPunctuation = isSpeceficPunctuation(textBetweenAddress, [])
            #wenn true, dann beginnt eine neue Autorenkette
            if setChainStart: 
                chainEnIndex = df_LOC["end"].iloc[index]
                #Solange das auf False, sollen der Substring erweitert werden, also start bleibt konstant
                setChainStart = False
            #Dann ist die Addressenkette zu Ende
            if not onlyPunctuation:
                startIndex = df_LOC["start"].iloc[index]
                endIndex = chainEnIndex
                break
        address = text[startIndex:endIndex]
        #If the chained range determined by the tagger corresponds to a string 
        #that is only delimited by punctuation before and after, then it is most likely a publisher.
        #startIndex - 2 because of a space inbetween
        #print(f' getAddress, text={{{text}}}')
        if startIndex > 2 and endIndex < len(text) - 1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                addressFound = True
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                addressFound = True
        if addressFound:
            changedText, address = replaceSubstring(startIndex, endIndex, text, "")
            return changedText, custom_strip(address)
    return text, ""  

def getPublisher(text, doi):
    publisher = ""
    if doi != "":
        url = f"https://api.crossref.org/works/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            publisher = data['message'].get('publisher', 'Publisher not found')
    if publisher != "":
        startIndex, endIndex, publisher = getIndexOfSubstring(text, [publisher], True)
        #double check
        if endIndex < len(text) -1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, publisher
        changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
        if publisher != "":
            return changedText, custom_strip(publisher)
    df_ORG = getORGTag(text, 0.8)
    if not df_ORG.empty:
        startIndex = df_ORG["start"].iloc[0]
        endIndex = df_ORG["end"].iloc[0]
        publisher = text[startIndex:endIndex]
        #If the range determined by the tagger corresponds to a string 
        #that is only delimited by punctuation before and after, then it is most likely a publisher.
        #startIndex - 2 because of a space inbetween
        if endIndex < len(text) -1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
    return text, ""
    
def getTitel(text):
    #print(f' getTitel 1, text={{{text}}}')
    ignoreCharacters = ["?", "!", "(", ")", "“", "”", "\""]
    text = custom_strip(text, ignoreCharacters)
    ignoreCharacters = ["?", ":", "-", "(", ")", "“", "”", "\""]
    limit = len(text) - 1
    i = 0
    maxIndex = 0
    #remove pairs of punctuation marks 
    while i < limit:
        if (i + 2 < limit) and not (text[i] == "," and text[i+1] == " " and not isSpeceficPunctuation(text[i+2])):
            if isSpeceficPunctuation(text[i], ignoreCharacters) and isSpeceficPunctuation(text[i+1], ignoreCharacters):
                text = text[:i] + "." + text[i+2:]
                i = i - 1
                limit = limit - 1
        i = i +1
    
    #print(f' getTitel 2, text={{{text}}}')
    ignoreCharacters = ["?", "!", "(", ")"]
    if text[0] == "“":
        text = text.rsplit('”', 1)
        return custom_strip(text[0], ignoreCharacters), custom_strip(text[1], ignoreCharacters), ""
    elif text[0] == "\"":
        text = text.rsplit("\"", 1)
        return custom_strip(text[0], ignoreCharacters), custom_strip(text[1], ignoreCharacters), ""
    elif text.count(".") == 1:
        #print(f' getTitel 3, text={{{text}}}')
        text = text.split(".")
        return custom_strip(text[0]), custom_strip(text[1]), ""
    elif text.count(".") == 2:
        text = text.split(".")
        return custom_strip(text[0]), custom_strip(text[1]), custom_strip(text[2])
    else:
        for index, element in enumerate(text):
            if isSpeceficPunctuation(element, [".", ",", " ", "(", ")", ":"]):
                if maxIndex < index:
                    maxIndex = index
        if maxIndex > 0:
            text = text.split(text[maxIndex])
            return custom_strip(text[0]), custom_strip(text[1]), ""
    if maxIndex == 0 and text.count(",") == 1: 
        #print(f' getTitel 4, text={{{text}}}')
        text = text.split(",")
        return custom_strip(text[0]), custom_strip(text[1]), ""
    elif maxIndex == 0 and text.count(",") > 1:
        text = text.rsplit(',', 1)
        return custom_strip(text[0]), custom_strip(text[1]), ""
    return custom_strip(text), "", ""
    
    
def getKey(author, year):
    lastNameFirstAuthor = author.split(" and ")[0].strip().split(" ")[-1]
    return f'{lastNameFirstAuthor}_{year}'


def create_bibtex(text):
    address = ""
    author = ""
    booktitle = ""
    chapter = ""
    doi = ""
    edition = ""
    editor = ""
    howpublished = ""
    isbn = ""
    journal = ""
    key = ""
    month = ""
    note = ""
    number = ""
    organization = ""
    pages = ""
    publisher = ""
    school = ""
    series = ""
    title = ""
    url = ""
    volume = ""
    year = ""
    key = ""
    isBook = False
    isProceedings = False
    isInProceedings = False
    isIncollection = False
    isArticle = False
    
    text, author = getAuthors(text)
    text, editor = getEditors(text)
    #print(f' main, text={{{text}}}')
    text, doi = getDoi(text)
    text, url = getURL(text)
    text, month, year = getDate(text)
    #print(f' main 2, text={{{text}}}')
    text, page = getPage(text)
    #print(f' main 3, text={{{text}}}')
    text, volume, number = getVolumeNumber(text)
    text, edition = getEdition(text)
    text, address = getAddress(text)
    text, publisher = getPublisher(text, doi)
    school = publisher
    #print(f' main 4, text={{{text}}}')
    title, booktitle, series = getTitel(text)
    journal = booktitle
    if author != "":
        key = getKey(author, year)
    else:
        key = getKey(editor, year)
    
    bookFields = [author, title, publisher, year, volume, number, \
                  series, address, edition, month, note, key, editor, \
                  howpublished, organization, chapter, pages, isbn, url]
    inproceedingsFields = [author, title, booktitle, year, editor, volume, \
                            number, series, pages, address, month, organization, \
                            publisher, note, key, doi, url]
    proceedingsFields = [title, year, editor, volume, number, series, \
                          address, month, organization, publisher, note, key, doi, url]
    incollectionFields = [author, title, booktitle, publisher, year, editor, \
                           volume, number, series, chapter, pages, address, \
                           edition, month, note, key, doi, url]
    articleFields = [author, title, journal, year, volume, number, \
                      pages, month, note, key, doi, url]
    phdthesisFields = [author, title, publisher, year, address, month, \
                        note, key, doi, url]
    
    bookFieldsString = ["author", "title", "publisher", "year", "volume", "number", \
                  "series", "address", "edition", "month", "note", "key", "editor", \
                  "howpublished", "organization", "chapter", "pages", "isbn", "url"]
    inproceedingsFieldsString  = ["author", "title", "booktitle", "year", "editor", "volume", \
                            "number", "series", "pages", "address", "month", "organization", \
                            "publisher", "note", "key", "doi", "url"]
    proceedingsFieldsString  = ["title", "year", "editor", "volume", "number", "series", \
                          "address", "month", "organization", "publisher", "note", "key", "doi", "url"]
    incollectionFieldsString  = ["author", "title", "booktitle", "publisher", "year", "editor", \
                           "volume", "number", "series", "chapter", "pages", "address", \
                           "edition", "month", "note", "key", "doi", "url"]
    articleFieldsString  = ["author", "title", "journal", "year", "volume", "number", \
                      "pages", "month", "note", "key", "doi", "url"]
    phdthesisFieldsString  = ["author", "title", "school", "year", "address", "month", \
                        "note", "key", "doi", "url"]

    models = [
        "LaLaf93/proceedings_recognizer",
        "LaLaf93/inproceedings_recognizer",
        "LaLaf93/book_recognizer",
        "LaLaf93/incollection_recognizer",
        "LaLaf93/article_recognizer",
        "LaLaf93/phdthesis_recognizer"
    ]

    labels = [
        "proceedings",
        "inproceedings",
        "book",
        "incollection",
        "article",
        "phdthesis"
    ]
    
    classifierDict = {}

    for model, label in zip(models, labels):
        result = pipeline("text-classification", model=model)(text)[0]
        classifierDict[label] = result
    
    #print(f'classifierDict: {classifierDict}')
    literatureType = ""
    highestScore = 0
    highetsScoreLabel = ""
    for entry in classifierDict.values():
        if entry['score'] > highestScore and not entry['label'].startswith('NON'):
            highestScore = entry['score']
            highetsScoreLabel = entry['label']
    literatureType = highetsScoreLabel
    #print(literatureType)
    if literatureType == "":
        for entry in classifierDict.values():
            lowestScore = 1
            lowestScoreLabel = ""
            if entry['score'] < lowestScore and entry['label'].startswith('NON'):
                lowestScore = entry['score']
                lowestScoreLabel = entry['label'].replace('NON', '')
        literatureType = lowestScoreLabel

    
    bibTex = "@"
    if literatureType == "book":
        zippedFieldsValues = zip(bookFieldsString, bookFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"book{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    elif literatureType == "proceedings":
        zippedFieldsValues = zip(proceedingsFieldsString, proceedingsFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"proceedings{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    elif literatureType == "inproceedings":
        zippedFieldsValues = zip(inproceedingsFieldsString, inproceedingsFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"inproceedings{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    elif literatureType == "incollection":
        zippedFieldsValues = zip(incollectionFieldsString, incollectionFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"incollection{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    elif literatureType == "article":
        zippedFieldsValues = zip(articleFieldsString, articleFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"article{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    else:
        zippedFieldsValues = zip(phdthesisFieldsString, phdthesisFields)
        zippedList = list(zippedFieldsValues)
        bibTex += f"phdthesis{{{key}, \n"
        for field in zippedList:
            bibTex += f'{field[0]}={{{field[1]}}},\n' 
    
    #Idee: Mit Pos-Tagging herausfinden, wo Nomen etc. vorkommen und dann titel und Booktitel eingrenzen
    bibTex += '}'

    return bibTex 



  editorRegEx = "\s*(\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?\s*"
  doiUrlRegEx1 = "https:\/\/doi\.org(\/[^\s]*)?$"
  doiUrlRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org)?([^\s]*)+$"
  httpsDomainRegEx1 = "https:\/\/doi\.org\/"
  httpsDomainRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org\/)?"
  urlRegEx = "(URL:|url:)?\s*https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?"
  monthYearRegex = "(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?" \
  "|May|May\.?|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept\.?|October|" \
  "Oct\.?|November|Nov\.?|December|Dec\.?)\s\d{4}"
  yearRegEx1 = "(\.|,)? \(\d{4}\)(\.|,|:)"
  yearRegEx2 = "(\.|,) \d{4}(\.|,|;)"
  pageRegEx = "(?:pp\.? )?\d+(-|–)\d+"
  volumeAndNumberRegex = "(\d+\(\d+\)|\d+\.\d+)"
  startIndex, endIndex, volume = getIndexOfSubstring(volumeNumber, ["\d+"])
  startIndex, endIndex, number = getIndexOfSubstring(volumeNumber, ["\d+"], True)
  volumeRegEx = "(V|v)ol\. \d+"
  volumeRegEx2 = ", \d+,"
 

In [2]:
# Beispiele

text="Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence. In J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948"
#text = "M. A. Nielsen and I. L. Chuang, “Quantum Computation and Quantum Information,” in Handbook of Quantum Information Science, vol. 4, C. H. Bennett and D. P. DiVincenzo, Eds. Berlin, Germany: Springer, 2026, pp. 250–300. doi: 10.1007/springerreference-303198."
#text = "Nielsen, M. A.; Chuang, I. L. Quantum Computation and Quantum Information. In Handbook of Quantum Information Science; Bennett, C. H., DiVincenzo, D. P., Eds.; Quantum Science and Technology; Springer: Berlin, Germany, 2026; Vol. 4, pp 250–300. https://doi.org/10.1007/springerreference-303198."

#BUG: startIndexReplace={-1} ist hier bei getYear! Deswegen doppelter String drin
text = """Alahmed, Y., Abadla, R., Badri, A. A., & Ameen, N. (2023). “How Does ChatGPT Work” Examining Functionality, To The Creative AI CHATGPT on X’s (Twitter) Platform. 2023 Tenth International Conference on Social Networks Analysis, Management and Security (SNAMS), 1–7. https://doi.org/10.1109/SNAMS60348.2023.10375450"""
text = "David Mertz, Regular Expression Puzzles and AI Coding Assistants: 24 puzzles solved by the author, with and without assistance from Copilot, ChatGPT and more , Manning, 2023."
text = """Mohammed Baziyad, Ibrahim Kamel, and Tamer Rabie. 2023. On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC), 1–6. DOI:https://doi.org/10.1109/ISNCC58260.2023.10323661"""
#text = """K. M. Caramancion, "Harnessing the Power of ChatGPT to Decimate Mis/Disinformation: Using ChatGPT for Fake News Detection," 2023 IEEE World AI IoT Congress (AIIoT), Seattle, WA, USA, 2023, pp. 0042-0046, doi: 10.1109/AIIoT58121.2023.10174450."""
#text = """Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence? In: J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). IEEE Press. https://doi.org/10.1109/ICNN.2021.9483948"""
#text = """A. Einstein, B. Podolsky, and N. Rosen, “Can Quantum-Mechanical Description of Physical Reality Be Considered Complete?,” Physical Review, vol. 47, no. 10, pp. 777–780, May 1935, doi: 10.1103/PhysRev.47.777."""
#text = """Badaro, G., Saeed, M., & Papotti, P. (2023). Transformers for tabular data representation: a survey of models and applications. Transactions of the Association for Computational Linguistics, 11, pp. 227–249. URL: https://aclanthology.org/2023.tacl-1.14, doi:10.1162/tacl_a_00544"""
#text = """Shamane Siriwardhana, Rivindu Weerasekera, Elliott Wen, Tharindu Kaluarachchi, Rajib Rana, and Suranga Nanayakkara. 2023. Improving the domain adaptation of retrieval augmented generation (RAG) models for open domain question answering. Transactions of the Association for Computational Linguistics 11, (2023), 1–17. URL: https://aclanthology.org/2023.tacl-1.1, doi:10.1162/tacl_a_00530"""
#text = """Mansouri Bigvand, A., Bu, T., & Sarkar, A. (2017). Joint prediction of word alignment with alignment types. Transactions of the Association for Computational Linguistics, 5, pp. 501–514. URL: https://aclanthology.org/Q17-1035, doi:10.1162/tacl_a_00076"""
#text = """Mansouri Bigvand, A., Bu, T., & Sarkar, A. 2017. "Joint Prediction of Word Alignment with Alignment Types." Transactions of the Association for Computational Linguistics, 5: 501–514. URL: https://aclanthology.org/Q17-1035, doi:10.1162/tacl_a_00076."""
#text = """Mansouri Bigvand, A., Bu, T., & Sarkar, A. 2017. ‘Joint Prediction of Word Alignment with Alignment Types." Transactions of the Association forComputational Linguistics, 5: 501-514. URL: https://aclanthology.org/Q17-1035, doi:10.1162/tacl_a_00076."""
#text = """Devika K, Hariprasath .s.b, Haripriya B, Vigneshwar E, Premjith B, and Bharathi Raja Chakravarthi. From dataset to detection: a comprehensive approach to combating Malayalam fake news. In Bharathi Raja Chakravarthi, Ruba Priyadharshini, Anand Kumar Madasamy, Sajeetha Thavareesan, Elizabeth Sherly, Rajeswari Nadarajan, and Manikandan Ravikiran, editors, Proceedings of the Fourth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages, pages 16–23, St. Julian's, Malta, March 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.dravidianlangtech-1.3."""
#text = """R, Jairam, G, Jyothish, and B, Premjith. "A few-shot multi-accented speech classification for Indian languages using transformers and LLM's fine-tuning approaches." Proceedings of the Fourth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages. Eds. Chakravarthi, Bharathi Raja, Priyadharshini, Ruba, Madasamy, Anand Kumar, Thavareesan, Sajeetha, Sherly, Elizabeth, Nadarajan, Rajeswari, and Ravikiran, Manikandan. St. Julian's, Malta: Association for Computational Linguistics, 2024. 1–9. URL: https://aclanthology.org/2024.dravidianlangtech-1.1"""
text = """Rozovskaya, Alla, Roth, Dan, and Sammons, Mark. "Adapting to learner errors with minimal supervision." Computational Linguistics 43.4 (2017): 723–760. URL: https://aclanthology.org/J17-4002, doi:10.1162/COLI_a_00299"""
#text = """Eds. Alonso, Jose M., and Catala, Alejandro. Proceedings of the 1st Workshop on Interactive Natural Language Technology for Explainable Artificial Intelligence (NL4XAI 2019). 2019. URL: https://aclanthology.org/W19-8400"""
print(create_bibtex(text))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted

@article{Rozovskaya_2017, 
author={Alla Rozovskaya and Dan Roth and Mark Sammons},
title={Adapting to learner errors with minimal supervision},
journal={Computational Linguistics},
year={2017},
volume={43},
number={4},
pages={},
month={},
note={},
key={Rozovskaya_2017},
doi={10.1162/COLI_a_00299},
url={https://aclanthology.org/J17-4002},
}


In [4]:
# Einlesen von Testdaten für eine vorläufige Evaluation

import pandas as pd
import warnings
import logging
import time

# Startzeit erfassen
start_time = time.time()

# Unterdrücken aller Warnungen
warnings.filterwarnings("ignore")
# Setze das Logging-Level für Transformers auf ERROR
logging.getLogger("transformers").setLevel(logging.ERROR)

# Pfad zur CSV-Datei
csv_file_path = 'Trainingsdaten/Testdaten/test_acm.csv'

# Einlesen der CSV-Datei in einen Pandas DataFrame mit dem Delimiter |
df = pd.read_csv(csv_file_path, delimiter='|')
df = df.iloc[list(range(1, 6)) + list(range(300, 306)) + list(range(600, 606)) + list(range(900, 906)) + list(range(1200, 1206)) + list(range(1500, 1506))]

# Neue Spalte für Ergebnisse oder Fehler erstellen
df['Modellergebnis'] = None

# Fehlerbehandlung und Anwenden der Funktion
for index, row in df.iterrows():
    try:
        result = create_bibtex(row['Referenzstring'])
        df.at[index, 'Modellergebnis'] = result
    except Exception as e:
        df.at[index, 'Modellergebnis'] = f"Fehler: {e}"

display(df)

df.to_csv('Trainingsdaten/Testdaten/test_acm_result3.csv', sep='|', index=False)
#print(create_bibtex(text))

# Endzeit erfassen
end_time = time.time()

# Ausführungszeit berechnen
execution_time = end_time - start_time
print(f"Die Ausführung der Codezelle hat {execution_time:.4f} Sekunden gedauert.")

Unnamed: 0,Referenzstring,Style,Literaturtyp,BibTeX,Modellergebnis
1,"Yuan Zhang, Regina Barzilay, and Tommi Jaakkol...",acm,article,"@article{zhang-etal-2017-aspect,\n title = ...","@article{Zhang_2017, \nauthor={Yuan Zhang and ..."
2,"Ryan J. Gallagher, Kyle Reing, David Kale, and...",acm,article,"@article{gallagher-etal-2017-anchored,\n ti...","@article{Gallagher_2017, \nauthor={Ryan J. Gal..."
3,"Shafiq Joty, Francisco Guzmán, Lluís Màrquez, ...",acm,article,"@article{joty-etal-2017-discourse,\n title ...","@article{Joty_2017, \nauthor={Shafiq Joty and ..."
4,"Alla Rozovskaya, Dan Roth, and Mark Sammons. 2...",acm,article,"@article{rozovskaya-etal-2017-adapting,\n t...","@article{Rozovskaya_2017, \nauthor={Alla Rozov..."
5,"Ákos Kádár, Grzegorz Chrupała, and Afra Alisha...",acm,article,"@article{kadar-etal-2017-representation,\n ...","@article{Kádár_2017, \nauthor={Ákos Kádár and ..."
300,"Magda Ševčíková, Zdeněk Žabokrtský, Eleonora L...",acm,proceedings,"@proceedings{ws-2019-international-resources,\...","@phdthesis{Ševčíková_2019, \nauthor={Magda Šev..."
301,Jose M. Alonso and Alejandro Catala (Eds.). 20...,acm,proceedings,"@proceedings{ws-2019-interactive,\n title =...","@phdthesis{Alonso_2019, \nauthor={Jose M. Alon..."
302,"Yoshinobu Kano, Claus Aranha, Michimasa Inaba,...",acm,proceedings,"@proceedings{ws-2019-international-ai,\n ti...","@phdthesis{Kano_2019, \nauthor={Yoshinobu Kano..."
303,"Anusha Balakrishnan, Vera Demberg, Chandra Kha...",acm,proceedings,"@proceedings{ws-2019-discourse-structure,\n ...","@phdthesis{Balakrishnan_2019, \nauthor={Anusha..."
304,Alexandre Rademaker and Francis Tyers (Eds.). ...,acm,proceedings,"@proceedings{ws-2019-universal,\n title = ""...","@phdthesis{Rademaker_2019, \nauthor={Alexandre..."


Die Ausführung der Codezelle hat 255.6526 Sekunden gedauert.


In [2]:
# Einlesen aller Testdaten und Prozessierung von 36 x 100 Beispielen

import pandas as pd
import warnings
import logging
import time

# Unterdrücken aller Warnungen
warnings.filterwarnings("ignore")
# Setze das Logging-Level für Transformers auf ERROR
logging.getLogger("transformers").setLevel(logging.ERROR)

# Einlesen der CSV-Datei in einen Pandas DataFrame mit dem Delimiter |
df_acm = pd.read_csv('Trainingsdaten/Testdaten/test_acm.csv', delimiter='|')
df_apa = pd.read_csv('Trainingsdaten/Testdaten/test_apa.csv', delimiter='|')
df_harv = pd.read_csv('Trainingsdaten/Testdaten/test_harv.csv', delimiter='|')
df_ieee = pd.read_csv('Trainingsdaten/Testdaten/test_ieee.csv', delimiter='|')
df_mla = pd.read_csv('Trainingsdaten/Testdaten/test_mla.csv', delimiter='|')
df_plain = pd.read_csv('Trainingsdaten/Testdaten/test_plain.csv', delimiter='|')
df = pd.concat([df_acm, df_apa, df_harv, df_ieee, df_mla, df_plain])

styles = ['acm', 'apa', 'harvard', 'ieee', 'mla', 'plain']
types = ['phdthesis', 'article', 'book', 'inproceedings', 'proceedings', 'incollection']
df_result = pd.DataFrame()
df_time = pd.DataFrame()

# Durchlaufen der Stile und Literaturtypen und Filterung der ersten 100 Beispiele pro Attributkombination
# Konvertierung von je 100 Beispielen plus Zeitmessung
for style in styles:
    for type in types:
        print("Start: " + style + ", " + type)
        df_temp = df[(df["Style"]==style) & (df["Literaturtyp"]==type)]
        df_temp = df_temp.head(100)

        # Neue Spalte für Ergebnisse oder Fehler erstellen
        df_temp['Modellergebnis'] = None

        # Startzeit erfassen
        start_time = time.time()
        
        # Fehlerbehandlung und Anwenden des Modells
        for index, row in df_temp.iterrows():
            try:
                result = create_bibtex(row['Referenzstring'])
                df_temp.at[index, 'Modellergebnis'] = result
            except Exception as e:
                df_temp.at[index, 'Modellergebnis'] = f"Fehler: {e}"

        # Endzeit erfassen
        end_time = time.time()
        
        # Ausführungszeit berechnen (in Sekunden)
        execution_time = end_time - start_time

        # Temporäres DataFrame an das Haupt-DataFrame anhängen
        df_result = pd.concat([df_result, df_temp])
        #display(df_temp)

        df_time_temp = pd.DataFrame({'Style': [style], 'Literaturtyp': [type], 'Zeit': [execution_time]})
        print(df_time_temp)
        df_time = pd.concat([df_time, df_time_temp])

# Export der Ergebnisse
df_result.to_csv('Trainingsdaten/Testdaten/test_result.csv', sep='|', index=False)
df_time.to_csv('Trainingsdaten/Testdaten/test_time.csv', sep='|', index=False)

Start: acm, phdthesis
  Style Literaturtyp        Zeit
0   acm    phdthesis  636.630408
Start: acm, article
  Style Literaturtyp        Zeit
0   acm      article  663.219442
Start: acm, book
  Style Literaturtyp        Zeit
0   acm         book  617.174743
Start: acm, inproceedings
  Style   Literaturtyp        Zeit
0   acm  inproceedings  837.186132
Start: acm, proceedings
  Style Literaturtyp        Zeit
0   acm  proceedings  669.577584
Start: acm, incollection
  Style  Literaturtyp        Zeit
0   acm  incollection  654.223004
Start: apa, phdthesis
  Style Literaturtyp        Zeit
0   apa    phdthesis  666.490255
Start: apa, article
  Style Literaturtyp        Zeit
0   apa      article  703.541977
Start: apa, book
  Style Literaturtyp        Zeit
0   apa         book  618.058641
Start: apa, inproceedings
  Style   Literaturtyp        Zeit
0   apa  inproceedings  905.227921
Start: apa, proceedings
  Style Literaturtyp        Zeit
0   apa  proceedings  736.632788
Start: apa, incollect