@book{advanced_physics,
  author        = {Stephen Hawking and Brian Greene and Maria Trump and Susan Soy},
  title         = {Advanced Concepts in Theoretical Physics},
  publisher     = {Cambridge University Press},
  year          = {2025},
  edition       = {3},
  volume        = {1},
  series        = {Advanced Studies in Physics},
  address       = {Cambridge, UK},
  month         = may,
  note          = {A seminal work in the field of theoretical physics},
  isbn          = {978-0-521-76948-0},
  doi           = {10.1017/example.book.2025},
  url           = {https://www.cambridge.org/advanced_physics},
  annote        = {Widely cited in the physics community},
  abstract      = {This book explores cutting-edge theories and concepts in theoretical physics...},
  keywords      = {Theoretical Physics, Quantum Mechanics, String Theory},
  language      = {English},
  price         = {75.00},
  size          = {600 pages},
  lccn          = {2020934576},
  mrnumber      = {MR3070071}
}

@article{quantum_entanglement,
  author        = {Albert Einstein and Boris Podolsky and Nathan Rosen},
  title         = {Can Quantum-Mechanical Description of Physical Reality Be Considered Complete?},
  journal       = {Physical Review},
  year          = {1935},
  volume        = {47},
  number        = {10},
  pages         = {777--780},
  month         = may,
  note          = {EPR Paradox paper, fundamental for quantum mechanics},
  doi           = {10.1103/PhysRev.47.777},
  url           = {https://journals.aps.org/pr/abstract/10.1103/PhysRev.47.777},
  abstract      = {In this paper, the authors discuss the EPR paradox and challenge the completeness of quantum mechanics...},
  keywords      = {Quantum Mechanics, EPR Paradox, Physical Reality},
  language      = {English},
  publisher     = {American Physical Society}
}

@inproceedings{deep_learning,
  author        = {Geoffrey Hinton and Yoshua Bengio and Yann LeCun},
  title         = {Deep Learning for Artificial Intelligence},
  booktitle     = {Proceedings of the IEEE International Conference on Neural Networks},
  year          = {2021},
  editor        = {Jane Smith and John Doe},
  volume        = {1},
  number        = {5},
  series        = {Advances in Neural Information Processing},
  pages         = {100--120},
  address       = {Montreal, Canada},
  month         = dec,
  organization  = {IEEE},
  publisher     = {IEEE Press},
  note          = {Keynote paper on recent advancements in deep learning},
  isbn          = {978-1-5386-4637-1},
  doi           = {10.1109/ICNN.2021.9483948},
  url           = {https://ieeexplore.ieee.org/document/9483948},
  annote        = {A seminal work on how deep learning transforms AI},
  abstract      = {This paper explores cutting-edge deep learning techniques and their impact on the development of artificial intelligence...},
  keywords      = {Deep Learning, Artificial Intelligence, Neural Networks},
  language      = {English}
}


@incollection{quantum_computation,
  author        = {Michael A. Nielsen and Isaac L. Chuang},
  title         = {Quantum Computation and Quantum Information},
  booktitle     = {Handbook of Quantum Information Science},
  publisher     = {Springer},
  year          = {2026},
  editor        = {Charles H. Bennett and David P. DiVincenzo},
  volume        = {4},
  series        = {Quantum Science and Technology},
  chapter       = {10},
  pages         = {250--300},
  address       = {Berlin, Germany},
  month         = oct,
  note          = {A comprehensive overview of the fundamentals of quantum computation},
  isbn          = {978-3-540-88702-7},
  doi           = {10.1007/springerreference-303198},
  url           = {https://www.springer.com/gp/book/9783540887027},
  annote        = {Essential reading for researchers entering the field of quantum information},
  abstract      = {This chapter delves into the principles of quantum computing, offering an accessible yet thorough introduction...},
  keywords      = {Quantum Computing, Quantum Information, Computational Models},
  language      = {English},
  price         = {45.00},
  size          = {50 pages}
}


In [72]:
import torch
import pandas as pd
torch.cuda.is_available()
from transformers import pipeline
import re
import string
import spacy
import requests

def custom_strip(text, replaceCharacter = []):
    """Strip punctuation and whitespace from both ends of ``text``.

    The strippable set starts as ``string.punctuation + string.whitespace``.
    Each entry of ``replaceCharacter`` is removed from that set first, so
    those characters are left in place at the edges of the result.
    """
    strippable = string.punctuation + string.whitespace
    for protected in replaceCharacter:
        strippable = strippable.replace(protected, '')
    return text.strip(strippable)


def getIndexOfSubstring(text, regEx = [], reverse = False):
    """Locate the longest match among several regular expressions.

    For every pattern in ``regEx`` exactly one occurrence in ``text`` is
    considered: the first one, or the last one when ``reverse`` is true.
    Among those candidates the one covering the most characters wins; on a
    tie the pattern listed earlier is kept.

    Returns:
        ``(start, end, matched_text)`` for the winning candidate, or
        ``(-1, -1, "")`` when no pattern produces a non-empty match.
    """
    pick = -1 if reverse else 0
    best = None
    bestSpan = 0
    for pattern in regEx:
        found = list(re.finditer(pattern, text))
        if not found:
            continue
        candidate = found[pick]
        span = candidate.end() - candidate.start()
        # Strictly greater: zero-length matches never win, earlier patterns win ties.
        if span > bestSpan:
            bestSpan = span
            best = candidate
    if best is None:
        return -1, -1, ""
    return best.start(), best.end(), text[best.start():best.end()]

def is_SurenameFirst(names):
    """Inspect the first space-separated token of ``names``.

    Returns True when that token ends with "." or when its last character is
    neither punctuation nor whitespace; otherwise False. ``processNames``
    uses a True result to mean that only the separators between names need
    to be normalised.
    """
    leadingToken = names.split(" ")[0]
    # A regex such as \w+ would, for example, NOT recognise "è", hence the
    # explicit punctuation test on the last character.
    return leadingToken.endswith(".") or not isSpeceficPunctuation(leadingToken[-1], [])
    
def is_NameShortened(df_PER, text=None):
    """Report whether any tagged span is a single character followed by ".".

    Args:
        df_PER: DataFrame with ``start`` and ``end`` columns holding character
            offsets into ``text`` (``end`` is used as the index of the first
            character after the span).
        text: The string the offsets refer to. When omitted, the module-level
            variable ``text`` is used, which is what this function previously
            relied on implicitly.

    Returns:
        True if at least one span has length 1 and is directly followed by a
        period, otherwise False.
    """
    if text is None:
        # Backward compatibility with callers that relied on the global.
        text = globals().get("text", "")
    if df_PER.empty:
        return False
    for start, end in zip(df_PER["start"], df_PER["end"]):
        # Bounds check first: a span ending at the end of the text has no
        # following character to inspect.
        if end - start == 1 and end < len(text) and text[end] == ".":
            return True
    return False


def isSpeceficPunctuation(text, replaceCharacter = []):
    """Check that ``text`` consists solely of punctuation and whitespace.

    Entries of ``replaceCharacter`` are removed from the accepted set, so a
    text containing one of them yields False. An empty ``text`` yields True.
    """
    accepted = string.punctuation + string.whitespace
    for excluded in replaceCharacter:
        accepted = accepted.replace(excluded, '')
    for symbol in text:
        if symbol not in accepted:
            return False
    return True

def is_Editor(editorRegEx, textBetweenNames, index, text):
    """Decide whether the name chain ending at ``index`` is marked as editors.

    Args:
        editorRegEx: Regular expression describing an editor marker.
        textBetweenNames: Text that follows the last name of the chain.
        index: Offset in ``text`` where the last name of the chain ends.
        text: The full reference string.

    Returns:
        True when an editor marker occurs in ``textBetweenNames`` and the
        first marker at or after ``index`` is separated from the name only by
        punctuation/whitespace (with "&" not counting as punctuation).
    """
    if not re.search(editorRegEx, textBetweenNames):
        return False
    # Search from ``index`` onwards. Searching the whole text could hit an
    # earlier occurrence (e.g. "ed" inside a word), which would make the
    # slice below empty and the check trivially true.
    tail = text[index:]
    marker = re.search(editorRegEx, tail)
    if marker is None:
        return False
    return isSpeceficPunctuation(tail[:marker.start()], ["&"])

def _reorderSurnameInitials(authors, separator):
    """Convert "Last, F." chunks delimited by ``separator`` to "F. Last and ..."."""
    joinTerms = ["., and ", "., & ", ". and ", ". & "]
    # The terms are literal text, so escape them: an unescaped "." would match
    # any character and could swallow the last letter of a name.
    andInAuthors = getIndexOfSubstring(authors, [re.escape(term) for term in joinTerms])[2]
    if andInAuthors != "":
        authors = authors.replace(andInAuthors, separator)
    reordered = []
    for name in authors.split(separator):
        # Splitting removed the trailing period of each chunk; put it back
        # without doubling it on the last chunk.
        name = (name + ".").replace("..", ".")
        parts = name.split(", ")
        if len(parts) > 1:
            reordered.append(parts[1] + " " + parts[0])
        else:
            # No ", " inside this chunk: keep it as it is instead of failing.
            reordered.append(parts[0])
    return " and ".join(reordered)


def processNames(authors):
    """Normalise a run of person names into the form "A and B and C".

    Four input shapes are handled, chosen in this order:

    1. ``is_SurenameFirst`` returns True: only the separators are rewritten,
       and only if an "and"/"&" joiner is present.
    2. The string contains "., ": chunks of the form "Last, F." separated by
       "., " are reordered to "F. Last".
    3. The string contains ".; ": same as 2 with ".; " as separator.
    4. The string contains ", ": the comma-separated items are treated as
       alternating (last, first) pairs.

    Args:
        authors: The raw name list cut out of a reference.

    Returns:
        The normalised names with leading/trailing punctuation stripped, or
        an empty string when none of the shapes applies.
    """
    finalAuthors = ""
    surenameFirst = is_SurenameFirst(authors.strip())
    print(f'function processNames, surenameFirst: {surenameFirst}')
    if surenameFirst:
        joinTerms = [" and ", ", and ", " & ", ", & "]
        startIndex, endIndex, andInAuthors = getIndexOfSubstring(
            authors, [re.escape(term) for term in joinTerms])
        print("Fall surenameFirst {0}".format(authors))
        # It does not matter here whether single initials were tagged as a
        # separate word although surnames still follow.
        if startIndex >= 0:
            authors = authors.replace(andInAuthors, " and ")
            finalAuthors = authors.replace(", ", " and ")
        else:
            finalAuthors = authors
    elif "., " in authors:
        print("Fall ., {0}".format(authors))
        finalAuthors = _reorderSurnameInitials(authors, "., ")
    elif ".; " in authors:
        print("Fall .; {0}".format(authors))
        finalAuthors = _reorderSurnameInitials(authors, ".; ")
    elif ", " in authors:
        print("Fall , {0}".format(authors))
        joinTerms = [", and ", ", & ", " and ", " & "]
        andInAuthors = getIndexOfSubstring(authors, [re.escape(term) for term in joinTerms])[2]
        if andInAuthors != "":
            authors = authors.replace(andInAuthors, ", ")
        authors = authors.split(", ")
        # Items alternate between last name and first name; the final pair is
        # appended separately so that no trailing " and " is produced.
        for i in range(0, len(authors) - 3, 2):
            finalAuthors = finalAuthors + authors[i+1] + " " + authors[i] + " and "
        finalAuthors = finalAuthors + authors[len(authors) - 1] + " " + authors[len(authors) - 2]
    return custom_strip(finalAuthors)

def getAuthors(text):
    """Cut the first chain of person names out of ``text``.

    Person spans come from ``getPersonTags``. Starting with the first span,
    the text after each span (up to the next span, or to the end of ``text``
    for the last one) is examined. The chain ends at the first span whose
    following text is neither pure punctuation (with "&" not counting as
    punctuation) nor exactly one of the "and"/"&" joiners.

    Returns:
        ``(changedText, authors)`` where the chain has been replaced by "."
        and passed through ``processNames``; ``(text, "")`` when no chain end
        is found.
    """
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]

    df_PER = getPersonTags(text)
    positions = df_PER.index.values.tolist()
    lastPosition = len(positions) - 1
    chainStart = None

    for index in positions:
        nameEnd = df_PER["end"].iloc[index]
        # Note: this always looks ahead to the text after the current name.
        if index < lastPosition:
            gap = text[nameEnd:df_PER["start"].iloc[index + 1]]
        else:
            gap = text[nameEnd:]
        punctuationOnly = isSpeceficPunctuation(gap, ["&"])
        joiner = getIndexOfSubstring(gap, search_terms)[2]
        if chainStart is None:
            chainStart = df_PER["start"].iloc[index]
        if punctuationOnly or gap == joiner:
            continue
        # The chain ends here; only the first chain is extracted.
        if chainStart > -1:
            changedText, author = replaceSubstring(chainStart, nameEnd, text, ".")
            return changedText, processNames(author)
        break
    return text, ""

def getEditors(text):
    """Cut the first chain of person names that is marked as editors.

    Person spans come from ``getPersonTags``. Chains of names are detected as
    in ``getAuthors``; a chain is accepted when ``is_Editor`` finds an editor
    marker (e.g. "(Eds.)") after its last name. The chain is first replaced
    by the placeholder "#EDITOR#", then the marker after the placeholder is
    removed, then the text between the placeholder and the closest preceding
    punctuation character other than ":" or " " is removed (presumably the
    introducing "In" - confirm), and finally the placeholder itself is
    replaced.

    Returns:
        ``(changedText, editors)`` with the editors normalised by
        ``processNames``; ``(text, "")`` when no editor chain is found.
    """
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    # Raw string: "\(" and "\." are not valid escape sequences in a normal
    # string literal. The pattern text itself is unchanged.
    editorRegEx = r"(\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?"
    onlyPunctuation = False
    onlyAnd = False
    setChainStart = True
    isEditor = False
    startIndexEditors = -1
    endIndexEditors = -1
    chainStartIndex = -1
    
    df_PER = getPersonTags(text)
    index_df_PER_List = df_PER.index.values.tolist()

    for index in index_df_PER_List:
        if index < len(index_df_PER_List) - 1:
            textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
        else:
            textBetweenNames = text[df_PER["end"].iloc[index]:]
        onlyPunctuation = isSpeceficPunctuation(textBetweenNames, ["&"])
        firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
        onlyAnd = textBetweenNames == andTyp
        # If true, a new chain of names starts at this span.
        if setChainStart: 
            chainStartIndex = df_PER["start"].iloc[index]
            # While this stays False the chain keeps growing, i.e. its start stays constant.
            setChainStart = False
        # The chain of names ends here.
        if not onlyPunctuation and not onlyAnd:
            setChainStart = True
            # A reference may contain only editors and no authors.
            isEditor = is_Editor(editorRegEx, textBetweenNames, df_PER["end"].iloc[index], text)
            if isEditor:
                startIndexEditors = chainStartIndex
                endIndexEditors = df_PER["end"].iloc[index]
                break
    #print(f'getAuthorsAndEditors: return: {[startIndexAuthors,endIndexAuthors],[startIndexEditors, endIndexEditors]}')
    if startIndexEditors > -1:
        changedText, editor = replaceSubstring(startIndexEditors, endIndexEditors, text, "#EDITOR#")
        editor = processNames(editor)
        print(f'text after replace editors : {changedText}')
        # Search for the marker only after the placeholder, hence
        # changedText[endIndexEditors:]; otherwise an earlier hit could be
        # mistaken for it.
        startIndexEditors, endIndexEditors, buffer = getIndexOfSubstring(changedText, ["#EDITOR#"])
        startIndexEditorMarker, endIndexEditorMarker, buffer = getIndexOfSubstring(changedText[endIndexEditors:], [editorRegEx])
        # Convert the offsets relative to the slice back to offsets in changedText.
        startIndexEditorMarker = startIndexEditorMarker + endIndexEditors
        endIndexEditorMarker = endIndexEditorMarker + endIndexEditors
        changedText, replacedEditorMarker = replaceSubstring(startIndexEditorMarker, endIndexEditorMarker, changedText, ".")
        print(f'text after replace editorsMarker : {changedText}')
        startIndexEditors, endIndexEditors, buffer = getIndexOfSubstring(changedText, ["#EDITOR#"])
        startIndexIn = 0 
        # Walk backwards from the placeholder to the closest punctuation
        # character that is neither ":" nor " ".
        for i in range(startIndexEditors-1, -1, -1):
            if isSpeceficPunctuation(changedText[i], [":", " "]):
                startIndexIn = i + 1
                break
        print(f' getEditors, startIndexIn : {startIndexIn}')
        print(f' getEditors, startIndexEditors : {startIndexEditors}')
        changedText, replacedEditorMarker = replaceSubstring(startIndexIn, startIndexEditors, changedText, ".")
        
        # "#.#" shows up here because "#" is not among the ignored characters.
        startIndexEditor, endIndexEditor, buffer = getIndexOfSubstring(changedText, ["#EDITOR#"])
        changedText, replacedEditorMarker = replaceSubstring(startIndexEditor, endIndexEditor, changedText, ".")
        return changedText, editor
    return text, ""

def getPublisher(text, doi):
    """Find the publisher of a reference and cut it out of ``text``.

    If ``doi`` is non-empty, the publisher name is requested from the
    Crossref REST API and, when that name occurs literally in ``text``, its
    last occurrence is removed. Otherwise (or when the lookup yields nothing
    usable) the ORG entity returned by ``getORGTag(text, 0.8)`` is used,
    provided it is delimited by punctuation/whitespace on both sides.

    Args:
        text: The remaining reference string.
        doi: Bare DOI, or "" when none is known.

    Returns:
        ``(changedText, publisher)``; ``(text, "")`` when nothing was found.
    """
    publisher = ""
    if doi != "":
        url = f"https://api.crossref.org/works/{doi}"
        try:
            # A timeout keeps an unresponsive server from blocking forever.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                # Default to "" so that a missing field is not searched for
                # in the text as if it were a publisher name.
                publisher = response.json()['message'].get('publisher', '')
        except (requests.RequestException, ValueError, KeyError, AttributeError) as error:
            # The lookup is optional; fall back to the NER based detection.
            print(f' getPublisher, crossref lookup failed: {error}')
            publisher = ""
    if publisher:
        # The name is literal text, not a pattern: escape regex metacharacters.
        startIndex, endIndex, matched = getIndexOfSubstring(text, [re.escape(publisher)], True)
        if startIndex >= 0:
            changedText, matched = replaceSubstring(startIndex, endIndex, text, "")
            if matched != "":
                return changedText, custom_strip(matched)
    df_ORG = getORGTag(text, 0.8)
    if not df_ORG.empty:
        startIndex = df_ORG["start"].iloc[0]
        endIndex = df_ORG["end"].iloc[0]
        # If the range determined by the tagger corresponds to a string
        # that is only delimited by punctuation before and after, then it is
        # most likely a publisher.
        # startIndex - 2 because of a space in between. Positions outside the
        # text count as delimiters instead of wrapping around to the other end.
        before = text[startIndex - 2] if startIndex >= 2 else ""
        after = text[endIndex + 1] if endIndex + 1 < len(text) else ""
        if isSpeceficPunctuation(before) and isSpeceficPunctuation(after):
            changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
            return changedText, custom_strip(publisher)
    return text, ""
            


def replaceSubstring(startIndex, endIndex, text, substituteString, ignorePunctuation = ["&", "(", ")"]):
    """Replace a span of ``text``, widened to the surrounding punctuation.

    The cut starts right after the closest punctuation/whitespace character
    at or before ``startIndex`` (position 0 if there is none, or if
    ``startIndex`` is not positive). It ends right after the first
    punctuation/whitespace character at or after ``endIndex - 1`` (end of
    text if there is none). Characters listed in ``ignorePunctuation`` do not
    count as punctuation. So the leading delimiter stays in the text while
    the trailing delimiter is removed together with the span.

    Returns:
        ``(changedText, removedText)``; ``(text, "")`` when ``endIndex`` is
        not positive.
    """
    if endIndex <= 0:
        return text, ""

    cutFrom = 0
    if startIndex > 0:
        for position in reversed(range(startIndex + 1)):
            if isSpeceficPunctuation(text[position], ignorePunctuation):
                cutFrom = position + 1
                break

    textLength = len(text)
    if endIndex < textLength:
        for position in range(endIndex - 1, textLength):
            if isSpeceficPunctuation(text[position], ignorePunctuation):
                cutTo = position + 1
                break
        else:
            # No delimiter up to the end of the text.
            cutTo = textLength
    else:
        cutTo = textLength

    return text[:cutFrom] + substituteString + text[cutTo:], text[cutFrom:cutTo]

def getAddress(text):
    """Cut a location out of ``text`` using the spans from ``getLOCTag``.

    The last LOC span is accepted when the character two positions before it
    and, unless the span reaches the end of the text, the character one
    position after its end are punctuation/whitespace. If a second span
    exists, is preceded by punctuation in the same way and is separated from
    the last span only by punctuation, both spans are removed as one address.

    Returns:
        ``(changedText, address)``; ``(text, "")`` when no span qualifies.
    """
    df_LOC = getLOCTag(text)
    if df_LOC.empty:
        return text, ""

    lastStart = df_LOC["start"].iloc[-1]
    lastEnd = df_LOC["end"].iloc[-1]
    print(f' getAddress, len(text)={{{len(text)}}}')
    print(f' getAddress, endIndex={{{lastEnd}}}')
    print(f' getAddress, text={{{text}}}')

    # lastStart - 2 because of a space in between.
    delimited = isSpeceficPunctuation(text[lastStart - 2])
    if delimited and lastEnd < len(text) - 1:
        delimited = isSpeceficPunctuation(text[lastEnd + 1])
    if not delimited:
        return text, ""

    if len(df_LOC) > 1:
        firstStart = df_LOC["start"].iloc[0]
        firstEnd = df_LOC["end"].iloc[0]
        if isSpeceficPunctuation(text[firstStart - 2]) and isSpeceficPunctuation(text[firstEnd:lastStart]):
            changedText, address = replaceSubstring(firstStart, lastEnd, text, "")
            return changedText, custom_strip(address)

    changedText, address = replaceSubstring(lastStart, lastEnd, text, "")
    return changedText, custom_strip(address)

def getDate(text):
    """Cut the publication date out of ``text``.

    A "<month name> <4-digit year>" date is looked for first. If none is
    present, a bare year in one of the forms "(2021)", ". 2021." or
    ", 2021," (also with "." or ";" as delimiters) is used instead.

    Returns:
        ``(changedText, month_field, year_field)`` where the fields are
        strings such as ``month={May}`` and ``year={2021}``; the month field
        is "" when only a year was found.
    """
    # Raw strings are required: in a normal string literal "\b" is a
    # backspace character, so the pattern could never match a date.
    monthYearRegex = r"\b(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?" \
    r"|May|May\.?|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept\.?|October|" \
    r"Oct\.?|November|Nov\.?|December|Dec\.?)\s\d{4}\b"
    changedText, monthYear  = getSubstringByRegEx(text, [monthYearRegex])
    print(f' getDate, text={{{text}}}')
    if monthYear == "":
        yearRegEx1 = r"(\(\d{4}\)|\. \d{4}\.)"
        yearRegEx2 = r"(\.|,) \d{4}(\.|,|;)"
        changedText, year  = getSubstringByRegEx(text, [yearRegEx1, yearRegEx2])
        print(f' getDate, year={{{year}}}')
        return changedText, "", f'year={{{year}}}'
    # The pattern guarantees exactly one whitespace between month and year.
    monthYear = monthYear.split(' ')
    return changedText, f'month={{{monthYear[0]}}}', f'year={{{monthYear[1]}}}'
    
def getTitel(text):
    """Split the remaining reference text into title, booktitle and series.

    The text is stripped of surrounding punctuation (keeping "?", ":", "-",
    "(" and ")"), pairs of adjacent punctuation characters are collapsed to
    ".", and the result is split at its single "." or ",", at its two "."s,
    or at its last ",".

    Returns:
        ``(title, booktitle, series)``. ``series`` is only filled when the
        text contains exactly two periods. When the text offers no usable
        split point, the whole text is returned as the title.
    """
    print(f' getTitel 1, text={{{text}}}')
    ignoreCharacters = ["?", ":", "-", "(", ")"]
    text = custom_strip(text, ignoreCharacters)
    limit = len(text) - 1
    i = 0
    # Remove pairs of punctuation marks.
    # Known issue: currently ", " is also removed in that case; check whether
    # ", ," is present.
    while i < limit:
        if (i + 2 < limit) and not (text[i] == "," and text[i+1] == " " and not isSpeceficPunctuation(text[i+2])):
            if isSpeceficPunctuation(text[i], ignoreCharacters) and isSpeceficPunctuation(text[i+1], ignoreCharacters):
                text = text[:i] + "." + text[i+2:]
                i = i - 1
                limit = limit - 1
        i = i +1
    print(f' getTitel 2, text={{{text}}}')
    if text.count(".") == 1:
        parts = text.split(".")
    elif text.count(",") == 1: 
        parts = text.split(",")
    elif text.count(".") == 2:
        parts = text.split(".")
        return custom_strip(parts[0]), custom_strip(parts[1]), custom_strip(parts[2])
    elif text.count(",") > 1:
        parts = text.rsplit(',', 1)
    else:
        # No usable split point. Without this branch the string itself was
        # indexed and its first two characters were returned.
        return text, "", ""
    return parts[0], parts[1], ""

def getPersonTags(text):
    """Run the default "ner" pipeline on ``text`` and keep the PER entities.

    Returns a DataFrame of the entities whose ``entity_group`` is "PER" with
    a fresh 0-based index, or an empty DataFrame when the pipeline reports
    no entities at all.
    """
    entities = pd.DataFrame(pipeline("ner", aggregation_strategy="simple")(text))
    if entities.empty:
        return pd.DataFrame()
    isPerson = entities["entity_group"] == "PER"
    return entities[isPerson].reset_index(drop=True)

def getORGTag(text, score):
    """Run the default "ner" pipeline and return the last confident ORG entity.

    Only entities with ``entity_group`` "ORG" and a ``score`` of at least
    ``score`` are considered; of those, the last one is returned as a
    DataFrame with at most one row. An empty DataFrame is returned when the
    pipeline reports no entities at all.
    """
    entities = pd.DataFrame(pipeline("ner", aggregation_strategy="simple")(text))
    if entities.empty:
        return pd.DataFrame()
    confident = (entities["entity_group"] == "ORG") & (entities["score"] >= score)
    return entities[confident].reset_index(drop=True).tail(1)

def getLOCTag(text):
    """Run the default "ner" pipeline and return the last two LOC entities.

    Returns a DataFrame with at most two rows (the final LOC entities in the
    order reported by the pipeline), or an empty DataFrame when the pipeline
    reports no entities at all.
    """
    entities = pd.DataFrame(pipeline("ner", aggregation_strategy="simple")(text))
    if entities.empty:
        return pd.DataFrame()
    isLocation = (entities["entity_group"] == "LOC")
    return entities[isLocation].reset_index(drop=True).tail(2)

def getDoi(text):
    """Cut a trailing DOI out of ``text`` and return it in bare form.

    Recognised are a "https://doi.org/..." URL at the end of the text and a
    "DOI:"/"doi:" label followed by the identifier (optionally given as a
    doi.org URL) at the end of the text. The resolver prefix and the label
    are removed from the returned DOI.

    Returns:
        ``(changedText, doi)``; the DOI is "" when none was found.
    """
    # Raw strings avoid invalid escape sequences such as "\/" and "\s".
    doiUrlRegEx1 = r"https://doi\.org(/\S*)?$"
    # A single \S* instead of the nested quantifier ([^\s]*)+ : both accept
    # the same text, but the nested form can backtrack exponentially when
    # the label is followed by a long token that is not at the end.
    doiUrlRegEx2 = r"(DOI|doi):\s?(https://doi\.org)?(\S*)$"
    text, doi  = getSubstringByRegEx(text, [doiUrlRegEx1, doiUrlRegEx2])
    httpsDomainRegEx1 = r"https://doi\.org/"
    httpsDomainRegEx2 = r"(DOI|doi):\s?(https://doi\.org/)?"
    # Removing the matched prefix from ``doi`` leaves the bare identifier as
    # the "changed text" of this call.
    doi, httpsDomain = getSubstringByRegEx(doi, [httpsDomainRegEx1, httpsDomainRegEx2])
    return text, custom_strip(doi)

def getSubstringByRegEx(text, regex = []):
    """Cut the last, longest match of the given patterns out of ``text``.

    The match is selected by ``getIndexOfSubstring`` in reverse mode and
    removed with ``replaceSubstring`` using an empty replacement.

    Returns:
        ``(changedText, removedText)`` with surrounding punctuation and
        whitespace stripped from the removed text.
    """
    matchStart, matchEnd, _ = getIndexOfSubstring(text, regex, True)
    remainder, removed = replaceSubstring(matchStart, matchEnd, text, "")
    return remainder, custom_strip(removed)

#search_terms = [", et al.", " et al."]
#firstStartIndex, firstEndIndex, etAl = find_First_Term(text, search_terms)
#if firstStartIndex > -1:
    #text = replaceSubstring(firstStartIndex, firstEndIndex, text, ", ")

def create_bibtex(text):
    """Extract bibliographic fields from one free-text reference.

    The fields are cut out of ``text`` one after another, in this order:
    authors, editors, DOI, URL, date, pages, volume, number, edition,
    address, publisher. Whatever remains is split into title and booktitle.

    Returns:
        A multi-line summary string listing the extracted fields, separated
        by ", \\r\\n". The extracted month and series are not part of it.
    """
    # Raw strings: "\.", "\d" and "\s" are not valid escape sequences in a
    # normal string literal. The pattern texts themselves are unchanged.
    urlRegEx = r"https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?"
    pageRegEx = r"(?:pp\.? )?\d+(-|–)\d+"
    volumeRegEx = r"(V|v)ol\. \d+"
    number1RegEx = r"no\. \d+"
    number2RegEx = r"Issue \d+"
    #number3RegEx = "\d+"
    edition1 = r"(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) ed\."
    edition2 = r"(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) edn\."
    
    text, author = getAuthors(text)
    text, editor = getEditors(text)
    text, doi = getDoi(text)
    text, url = getSubstringByRegEx(text, [urlRegEx])
    text, month, year = getDate(text)
    text, page = getSubstringByRegEx(text, [pageRegEx])
    print(f' main, text={{{text}}}')
    text, volume = getSubstringByRegEx(text, [volumeRegEx])
    text, number = getSubstringByRegEx(text, [number1RegEx, number2RegEx])
    text, edition = getSubstringByRegEx(text, [edition1, edition2])
    # BUGFIX note: if only a number occurs, a bare-digit volume pattern would
    # cut the digits out of the number. Such a pattern (number3RegEx above)
    # may therefore only be tried after both number patterns have been
    # checked, although the volume always appears before the number.
    print(f' main, text2={{{text}}}')
    text, address = getAddress(text)
    text, publisher = getPublisher(text, doi)
    titel, booktitel, series = getTitel(text)
    
    # Idea: use POS tagging to find where nouns etc. occur and narrow down
    # title and booktitle with that.

    return ", \r\n".join([
        f'authors: {author}',
        f'editors: {editor}',
        f'doi: {doi}',
        f'{year}',
        f'number : {number}',
        f'volume : {volume}',
        f'edition: {edition}',
        f'page: {page}',
        f'url: {url}',
        f'publisher: {publisher}',
        f'address: {address}',
        f'titel: {titel}',
        f'booktitel: {booktitel}',
    ])



In [73]:
# Sample references in different citation styles. ``text`` is reassigned
# several times below, so only the last assignment is passed to create_bibtex.
text="Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence. In J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948"
#text = "M. A. Nielsen and I. L. Chuang, “Quantum Computation and Quantum Information,” in Handbook of Quantum Information Science, vol. 4, C. H. Bennett and D. P. DiVincenzo, Eds. Berlin, Germany: Springer, 2026, pp. 250–300. doi: 10.1007/springerreference-303198."
#text = "Nielsen, M. A.; Chuang, I. L. Quantum Computation and Quantum Information. In Handbook of Quantum Information Science; Bennett, C. H., DiVincenzo, D. P., Eds.; Quantum Science and Technology; Springer: Berlin, Germany, 2026; Vol. 4, pp 250–300. https://doi.org/10.1007/springerreference-303198."

# BUG: startIndexReplace={-1} occurs here in getYear! That is why a duplicated string ends up in the result.
text = """Alahmed, Y., Abadla, R., Badri, A. A., & Ameen, N. (2023). “How Does ChatGPT Work” Examining Functionality, To The Creative AI CHATGPT on X’s (Twitter) Platform. 2023 Tenth International Conference on Social Networks Analysis, Management and Security (SNAMS), 1–7. https://doi.org/10.1109/SNAMS60348.2023.10375450"""
text = "David Mertz, Regular Expression Puzzles and AI Coding Assistants: 24 puzzles solved by the author, with and without assistance from Copilot, ChatGPT and more , Manning, 2023."
text = """Mohammed Baziyad, Ibrahim Kamel, and Tamer Rabie. 2023. On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC), 1–6. DOI:https://doi.org/10.1109/ISNCC58260.2023.10323661"""
# Print the field summary extracted from the reference above.
print(create_bibtex(text))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted

function processNames, surenameFirst: True
Fall surenameFirst


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


 getDate, text={. 2023. On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC), 1–6. }
 getDate, year={2023}
 main, text={ On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC),  }
 main, text2={ On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC),  }


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 getTitel 1, text={ On the Linguistic Limitations of ChatGPT: An Experimental Case Study. In 2023 International Symposium on Networks, Computers and Communications (ISNCC),  }
 getTitel 2, text={On the Linguistic Limitations of ChatGPT: An Experimental Case Study.In 2023 International Symposium on Networks, Computers and Communications (ISNCC)}
authors: Mohammed Baziyad and Ibrahim Kamel and Tamer Rabie, 
editors: , 
doi: 10.1109/ISNCC58260.2023.10323661, 
year={2023}, 
number : , 
volume : , 
edition: , 
page: 1–6, 
url: , 
publisher: , 
address: , 
titel: On the Linguistic Limitations of ChatGPT: An Experimental Case Study, 
booktitel: In 2023 International Symposium on Networks, Computers and Communications (ISNCC)


Idee: Nun zunächst Jahr, Volume, Seiten, Edition und URL/DOI extrahieren. Über den Rest (also Titel, Publisher, Series) dann nochmal den spaCy-Parser laufen lassen, weil …

In [70]:
# Test whether the date is recognised reliably.
# Scratch cell: runs spaCy and the transformers NER pipelines on a sample
# string and prints their raw output for inspection.

import spacy
from spacy import displacy



# ``text`` is reassigned; only the second sample is analysed below.
text = "Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence, In Proceedings of the IEEE International Conference on Neural Networks, Physical Review."
text = """"What Drives IT Students Toward ChatGPT? Analyzing the Factors Influencing Students' Intention to Use ChatGPT for Educational Purposes," 2024 21st International Multi-Conference on Systems, Signals & Devices (SSD)"""

#text = "dsfsdf London"
print(len(text))
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# Visualise the dependencies.
displacy.render(doc, style='dep', jupyter=True)
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
#for ent in doc.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Part of speech, syntactic head and dependency label of every token.
for token in doc:
    print(f'Text: {token.text}, POS: {token.pos_}, Head: {token.head.text}, Dep: {token.dep_}')

# Aggregated entities from the default "ner" pipeline, shown as a table.
ner_tagger = pipeline("ner", aggregation_strategy="simple")
outputs = ner_tagger(text)
df_outputs = pd.DataFrame(outputs)
print(df_outputs)
# NOTE(review): despite its name, pos_pipeline loads a checkpoint whose name
# says it is fine-tuned on CoNLL-03, so the results are presumably NER tags
# rather than POS tags - confirm.
pos_pipeline = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")
results = pos_pipeline(text)
for result in results:
    print(result)
    print("---------------------------------------------------------------------------")
    #print(f"Word: {result['word']}, POS Tag: {result['entity']}")

213


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Text: ", POS: PUNCT, Head: Students, Dep: punct
Text: What, POS: PRON, Head: Drives, Dep: nsubj
Text: Drives, POS: VERB, Head: Students, Dep: csubj
Text: IT, POS: PRON, Head: Students, Dep: compound
Text: Students, POS: NOUN, Head: Students, Dep: ROOT
Text: Toward, POS: ADP, Head: Students, Dep: prep
Text: ChatGPT, POS: NOUN, Head: Toward, Dep: pobj
Text: ?, POS: PUNCT, Head: Students, Dep: punct
Text: Analyzing, POS: VERB, Head: Analyzing, Dep: ROOT
Text: the, POS: DET, Head: Factors, Dep: det
Text: Factors, POS: NOUN, Head: Analyzing, Dep: dobj
Text: Influencing, POS: VERB, Head: Factors, Dep: acl
Text: Students, POS: PROPN, Head: Intention, Dep: poss
Text: ', POS: PART, Head: Students, Dep: case
Text: Intention, POS: NOUN, Head: Influencing, Dep: dobj
Text: to, POS: PART, Head: Use, Dep: aux
Text: Use, POS: VERB, Head: Influencing, Dep: xcomp
Text: ChatGPT, POS: NOUN, Head: Use, Dep: dobj
Text: for, POS: ADP, Head: ChatGPT, Dep: prep
Text: Educational, POS: PROPN, Head: Purposes, De

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  entity_group     score                   word  start  end
0          ORG  0.577290                     IT     13   15
1         MISC  0.982862    International Multi    147  166
2         MISC  0.767340  Conference on Systems    167  188
3          ORG  0.379331                      ,    188  189
4         MISC  0.555999                Signals    190  197
5          ORG  0.612880                      &    198  199
6         MISC  0.757536                Devices    200  207
7         MISC  0.718369                    SSD    209  212


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'I-ORG', 'score': 0.57729024, 'index': 5, 'word': 'IT', 'start': 13, 'end': 15}
---------------------------------------------------------------------------
{'entity': 'I-MISC', 'score': 0.9796957, 'index': 45, 'word': 'International', 'start': 147, 'end': 160}
---------------------------------------------------------------------------
{'entity': 'I-MISC', 'score': 0.98602736, 'index': 46, 'word': 'Multi', 'start': 161, 'end': 166}
---------------------------------------------------------------------------
{'entity': 'I-MISC', 'score': 0.9863149, 'index': 48, 'word': 'Conference', 'start': 167, 'end': 177}
---------------------------------------------------------------------------
{'entity': 'I-MISC', 'score': 0.6964862, 'index': 49, 'word': 'on', 'start': 178, 'end': 180}
---------------------------------------------------------------------------
{'entity': 'I-MISC', 'score': 0.61921793, 'index': 50, 'word': 'Systems', 'start': 181, 'end': 188}
------------------------------

In [14]:
from scholarly import scholarly

# Search Google Scholar for an author. Only the first hit is used, and it is
# not guaranteed to be the intended person, so check the printed name.
search_query = scholarly.search_author('Fernández, R.')
# Use a default so an empty result set does not raise StopIteration.
author = next(search_query, None)

if author is None:
    print("No author found for the search query.")
else:
    # Load the complete author profile.
    author = scholarly.fill(author)

    # Show some information about the author.
    print(f"Name: {author['name']}")
    print(f"Affiliation: {author['affiliation']}")

    # Show the first five publications.
    for publication in author['publications'][:5]:
        pub = scholarly.fill(publication)
        bib = pub['bib']
        # Use .get so a record lacking one of these fields prints a
        # placeholder instead of raising KeyError.
        print(f"\nTitle: {bib.get('title', 'n/a')}")
        print(f"Authors: {bib.get('author', 'n/a')}")
        print(f"Year: {bib.get('pub_year', 'n/a')}")

Name: T.R. Fernandez Perez Tomei
Affiliation: Researcher IV, Universidade Estadual Paulista

Title: Observation of a new boson at a mass of 125 GeV with the CMS experiment at the LHC
Authors: Serguei Chatrchyan and Vardan Khachatryan and Albert M Sirunyan and Armen Tumasyan and Wolfgang Adam and Ernest Aguilo and Thomas Bergauer and Marko Dragicevic and Janos Erö and Christian Fabjan and M Friedl and Rudolf Frühwirth and VM Ghete and J Hammer and M Hoch and N Hörmann and J Hrubec and M Jeitler and W Kiesenhofer and V Knünz and M Krammer and I Krätschmer and D Liko and W Majerotto and I Mikulec and M Pernicka and B Rahbaran and C Rohringer and H Rohringer and R Schöfbeck and J Strauss and F Szoncsó and A Taurok and W Waltenberger and G Walzel and E Widl and C-E Wulz and I Emeliantchik and V Makarenko and N Shumeiko and A Solin and R Stefanovitch and J Suarez Gonzalez and A Fedorov and M Korzhik and O Missevitch and R Zuyeuski and M Bansal and S Bansal and W Beaumont and Tom Cornelis and


Title: Evidence for the 125 GeV Higgs boson decaying to a pair of τ leptons
Authors: Serguei Chatrchyan and Vardan Khachatryan and Albert M Sirunyan and Armen Tumasyan and Wolfgang Adam and Thomas Bergauer and Marko Dragicevic and Janos Erö and Christian Fabjan and Markus Friedl and Rudolf Fruehwirth and Vasile Mihai Ghete and Christian Hartl and Natascha Hörmann and Josef Hrubec and Manfred Jeitler and Wolfgang Kiesenhofer and Valentin Knünz and Manfred Krammer and Ilse Krätschmer and Dietrich Liko and Ivan Mikulec and Dinyar Rabady and Babak Rahbaran and Herbert Rohringer and Robert Schöfbeck and Josef Strauss and Anton Taurok and Wolfgang Treberer-Treberspurg and Wolfgang Waltenberger and C-E Wulz and N Shumeiko and S Alderweireldt and M Bansal and S Bansal and T Cornelis and EA De Wolf and X Janssen and A Knutsson and S Luyckx and S Ochesanu and B Roland and R Rougny and H Van Haevermaet and P Van Mechelen and A Van Spilbeeck and F Blekman and S Blyweert and J D’Hondt and N Heracl