@book{advanced_physics, author = {Stephen Hawking and Brian Greene and Maria Trump and Susan Soy}, title = {Advanced Concepts in Theoretical Physics}, publisher = {Cambridge University Press}, year = {2025}, edition = {3}, volume = {1}, series = {Advanced Studies in Physics}, address = {Cambridge, UK}, month = {May}, note = {A seminal work in the field of theoretical physics}, isbn = {978-0-521-76948-0}, doi = {10.1017/example.book.2025}, url = {https://www.cambridge.org/advanced_physics}, annote = {Widely cited in the physics community}, abstract = {This book explores cutting-edge theories and concepts in theoretical physics...}, keywords = {Theoretical Physics, Quantum Mechanics, String Theory}, language = {English}, price = {75.00}, size = {600 pages}, lccn = {2020934576}, mrnumber = {MR3070071} }

@article{quantum_entanglement,
  author        = {Albert Einstein and Boris Podolsky and Nathan Rosen},
  title         = {Can Quantum-Mechanical Description of Physical Reality Be Considered Complete?},
  journal       = {Physical Review},
  year          = {1935},
  volume        = {47},
  number        = {10},
  pages         = {777-780},
  month         = {May},
  note          = {EPR Paradox paper, fundamental for quantum mechanics},
  doi           = {10.1103/PhysRev.47.777},
  url           = {https://journals.aps.org/pr/abstract/10.1103/PhysRev.47.777},
  abstract      = {In this paper, the authors discuss the EPR paradox and challenge the completeness of quantum mechanics...},
  keywords      = {Quantum Mechanics, EPR Paradox, Physical Reality},
  language      = {English},
  publisher     = {American Physical Society}
}

@inproceedings{deep_learning,
  author        = {Geoffrey Hinton and Yoshua Bengio and Yann LeCun},
  title         = {Deep Learning for Artificial Intelligence},
  booktitle     = {Proceedings of the IEEE International Conference on Neural Networks},
  year          = {2021},
  editor        = {Jane Smith and John Doe},
  volume        = {1},
  number = {5},
  series        = {Advances in Neural Information Processing},
  pages         = {100-120},
  address       = {Montreal, Canada},
  month         = {December},
  organization  = {IEEE},
  publisher     = {IEEE Press},
  note          = {Keynote paper on recent advancements in deep learning},
  isbn          = {978-1-5386-4637-1},
  doi           = {10.1109/ICNN.2021.9483948},
  url           = {https://ieeexplore.ieee.org/document/9483948},
  annote        = {A seminal work on how deep learning transforms AI},
  abstract      = {This paper explores cutting-edge deep learning techniques and their impact on the development of artificial intelligence...},
  keywords      = {Deep Learning, Artificial Intelligence, Neural Networks},
  language      = {English}
}


@incollection{quantum_computation,
  author        = {Michael A. Nielsen and Isaac L. Chuang},
  title         = {Quantum Computation and Quantum Information},
  booktitle     = {Handbook of Quantum Information Science},
  publisher     = {Springer},
  year          = {2026},
  editor        = {Charles H. Bennett and David P. DiVincenzo},
  volume        = {4},
  series        = {Quantum Science and Technology},
  chapter       = {10},
  pages         = {250-300},
  address       = {Berlin, Germany},
  month         = {October},
  note          = {A comprehensive overview of the fundamentals of quantum computation},
  isbn          = {978-3-540-88702-7},
  doi           = {10.1007/springerreference-303198},
  url           = {https://www.springer.com/gp/book/9783540887027},
  annote        = {Essential reading for researchers entering the field of quantum information},
  abstract      = {This chapter delves into the principles of quantum computing, offering an accessible yet thorough introduction...},
  keywords      = {Quantum Computing, Quantum Information, Computational Models},
  language      = {English},
  price         = {45.00},
  size          = {50 pages}
}


In [56]:
import torch
import pandas as pd
torch.cuda.is_available()
from transformers import pipeline
import re
import string
import spacy
import requests

def custom_strip(text):
    strip_chars = string.whitespace + string.punctuation + '“”'
    return text.strip(strip_chars)


def getIndexOfSubstring(text, regEx = [], reverse = False):
    #if reverse = False then it finds the first occurance of a given regEx.
    #if reverse = True, then it finds the last occurance of a given regEx.
    #beceause the occurance with the minimal startIndex is taken, it always chooses the most specific regex.
    minStartIndex = float('inf')
    endIndex = -1
    matches = []
    substring = ""
    for regExElement in regEx:
        matches = list(re.finditer(regExElement, text))
        if matches:
            if reverse:
                match = matches[-1]
            else:
                match = matches[0]
            if match.start() < minStartIndex:
                minStartIndex = match.start()
                endIndex = match.end()
                substring = text[match.start():match.end()]
    if substring != "":
        return minStartIndex, endIndex, substring   
    return -1, -1, substring

def is_SurenameFirst(names):
    splitedNames = names.split(" ")
    #regex wie w+ erkennt bspw. KEIN è 
    if splitedNames[0].endswith(".") or not isSpeceficPunctuation(splitedNames[0][-1], []):
        return True
    return False
    
def is_NameShortened(df_PER):
    for index in df_PER.index.values.tolist():
        if "." == text[df_PER["end"].iloc[index]] and len(text[df_PER["start"].iloc[index]:df_PER["end"].iloc[index] + 1]) == 2:
            return True
    return False


def isSpeceficPunctuation(text, replaceCharacter = []):
    allowed_chars = string.punctuation + string.whitespace
    for character in replaceCharacter:
        allowed_chars = allowed_chars.replace(character, '')
    return all(char in allowed_chars for char in text)

def is_Editor(editorRegEx, textBetweenNames, index, text):
    if re.search(editorRegEx, textBetweenNames):
        x = re.search(editorRegEx, text)
        #print(f'x: {x.start()}')
        if isSpeceficPunctuation(text[index:x.start()], ["&"]):
            return True
    return False

def processNames(authors):
    finalAuthors = ""
    search_terms = [" and ", ", and ", " & ", ", & "]
    #andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
    surenameFirst = is_SurenameFirst(authors.strip())
    print(f'function processNames, surenameFirst: {surenameFirst}')
    if surenameFirst:
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        print("Fall surenameFirst".format(authors))
        #hier völlig egal, ob er einzelne Initialen in ein eigenes Word gesteckt hat, obwohl es noch Nachnamen gib
        authors = authors.replace(andInAuthors, " and ")
        #print(f'authors: {authors}')
        finalAuthors = authors.replace(", ", " and ")
    elif "., " in authors:
        print("Fall ., {0}".format(authors))
        search_terms = ["., and ", "., & ", ". and ", ". & "]
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        if andInAuthors != "":
            authors = authors.replace(andInAuthors, "., ")
        authors = authors.split("., ")
        authors = [name + "." for name in authors]
        authors = [name.replace("..",".") for name in authors]
        for author in authors[:-1]:
            buffer = author.split(", ")
            finalAuthors = finalAuthors + buffer[1] + " " + buffer[0] + " and "
        buffer = authors[-1].split(", ")
        finalAuthors = finalAuthors + buffer[1] + " " +  buffer[0]
    elif ".; " in authors:
        print("Fall .; {0}".format(authors))
        search_terms = ["., and ", "., & ", ". and ", ". & "]
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        if andInAuthors != "":
            authors = authors.replace(andInAuthors, ".; ")
        authors = authors.split(".; ")
        authors = [name + "." for name in authors]
        authors = [name.replace("..",".") for name in authors]
        for author in authors[:-1]:
            buffer = author.split(", ")
            finalAuthors = finalAuthors + buffer[1] + " " + buffer[0] + " and "
        buffer = authors[-1].split(", ")
        finalAuthors = finalAuthors + buffer[1] + " " +  buffer[0]
    elif ", " in authors:
        print("Fall , {0}".format(authors))
        search_terms = [", and ", ", & ", " and ", " & "]
        andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
        if andInAuthors != "":  
            authors = authors.replace(andInAuthors, ", ")
        authors = authors.split(", ")
        for i in range(0, len(authors) - 3, 2):
            finalAuthors = finalAuthors + authors[i+1] + " " + authors[i] + " and "
        finalAuthors = finalAuthors + authors[len(authors) - 1] + " " + authors[len(authors) - 2]
    return finalAuthors

def getAuthors(text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    
    onlyPunctuation = False
    onlyAnd = False
    authorsDetected = False
    setChainStart = True
    startIndexAuthors = -1
    endIndexAuthors = -1
    chainStartIndex = -1
    changedText = ""
    
    df_PER = getPersonTags(text)
    index_df_PER_List = df_PER.index.values.tolist()
    
    for index in index_df_PER_List:
        #beachte: Hiermit lese ich immer schon vor!
        if index < len(index_df_PER_List) - 1:
            textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
        else:
            textBetweenNames = text[df_PER["end"].iloc[index]:]
        onlyPunctuation = isSpeceficPunctuation(textBetweenNames, ["&"])
        firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
        onlyAnd = textBetweenNames == andTyp
        if setChainStart: 
            chainStartIndex = df_PER["start"].iloc[index]
            setChainStart = False
        if not onlyPunctuation and not onlyAnd:
            setChainStart = True
            startIndexAuthors = chainStartIndex
            endIndexAuthors = df_PER["end"].iloc[index]
            break
    if startIndexAuthors > -1:
        changedText, author = replaceSubstring(startIndexAuthors, endIndexAuthors, text, "")
        author = processNames(author)
        return changedText, author
    return text, ""

def getEditors(text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    editorRegEx = " (\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?"
    onlyPunctuation = False
    onlyAnd = False
    setChainStart = True
    isEditor = False
    startIndexEditors = -1
    endIndexEditors = -1
    chainStartIndex = -1
    
    df_PER = getPersonTags(text)
    index_df_PER_List = df_PER.index.values.tolist()

    for index in index_df_PER_List:
        if index < len(index_df_PER_List) - 1:
            textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
        else:
            textBetweenNames = text[df_PER["end"].iloc[index]:]
        onlyPunctuation = isSpeceficPunctuation(textBetweenNames, ["&"])
        firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
        onlyAnd = textBetweenNames == andTyp
        #wenn true, dann beginnt eine neue Autorenkette
        if setChainStart: 
            chainStartIndex = df_PER["start"].iloc[index]
            #Solange das auf False, sollen der Substring erweitert werden, also start bleibt konstant
            setChainStart = False
        #Dann ist die Autorenkette zu Ende
        if not onlyPunctuation and not onlyAnd:
            setChainStart = True
            #Es können auch nur Editoren und keine Autoren vorkommen
            isEditor = is_Editor(editorRegEx, textBetweenNames, df_PER["end"].iloc[index], text)
            if isEditor:
                startIndexEditors = chainStartIndex
                endIndexEditors = df_PER["end"].iloc[index]
                break
    #print(f'getAuthorsAndEditors: return: {[startIndexAuthors,endIndexAuthors],[startIndexEditors, endIndexEditors]}')
    if startIndexEditors > -1:
        changedText, editor = replaceSubstring(startIndexEditors, endIndexEditors, text, "#EDITOR#")
        print(f'text after replace editors : {text}')
        #es soll erst ab Editors gesucht werden, daher text[endIndexEditors:]. Sonst Verwechslungsgefahr
        endIndexEditors = getIndexOfSubstring(text, ["#EDITOR#"])[1]
        startIndexEditorMarker, endIndexEditorMarker, buffer = getIndexOfSubstring(changedText[endIndexEditors:], [editorRegEx])
        startIndexEditorMarker = startIndexEditorMarker + endIndexEditors
        endIndexEditorMarker = endIndexEditorMarker + endIndexEditors
        editor = processNames(editor)
        changedText, replacedEditorMarker = replaceSubstring(startIndexEditorMarker, endIndexEditorMarker, changedText, ".")
        startIndexEditor, endIndexEditor, buffer = getIndexOfSubstring(text, ["#EDITOR#"])
        changedText, replacedEditorMarker = replaceSubstring(startIndexEditor, endIndexEditor, text, ".")
        return changedText, editor
    return text, ""

def getPublisher(text, doi):
    publisher = ""
    if doi != "":
        url = f"https://api.crossref.org/works/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            publisher = data['message'].get('publisher', 'Publisher not found')
    if publisher != "":
        startIndex, endIndex, publisher = getIndexOfSubstring(text, [publisher], True)
        #double check
        if endIndex < len(text) -1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, publisher
        changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
        if publisher != "":
            return changedText, custom_strip(publisher)
    df_ORG = getORGTag(text, 0.8)
    if not df_ORG.empty:
        startIndex = df_ORG["start"].iloc[0]
        endIndex = df_ORG["end"].iloc[0]
        publisher = text[startIndex:endIndex]
        #If the range determined by the tagger corresponds to a string 
        #that is only delimited by punctuation before and after, then it is most likely a publisher.
        #startIndex - 2 because of a space inbetween
        if endIndex < len(text) -1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                changedText, publisher = replaceSubstring(startIndex, endIndex, text, "")
                return changedText, custom_strip(publisher)
    return text, ""
            


def replaceSubstring (startIndex, endIndex, text, substituteString, ignorePunctuation = ["&", "(", ")"]):
    #The regex also checks for punctuation so that it is particularly precise. 
    #The cut text however should be without delimiters of the bibTex fields in the bibiography, 
    #so that future regex are not affected.
    if endIndex > 0:
        startIndexReplace = -1
        endIndexReplace = -1
        changedText = text
        if startIndex > 0:
            for i in range(startIndex - 1, 0, -1):
                if not isSpeceficPunctuation(text[i], ignorePunctuation):
                    startIndexReplace = i + 1
                    for n in range(startIndexReplace, len(text), 1):
                        if not isSpeceficPunctuation(text[n], ignorePunctuation):
                            startIndexReplace = n
                            break
                    break
        else:
            startIndexReplace = 0
            
        #print(f' replaceSubstring, startIndexReplace={{{startIndexReplace}}}')
        if endIndex < len(text):
            for i in range(endIndex, len(text), 1):
                if not isSpeceficPunctuation(text[i], ignorePunctuation):
                    endIndexReplace = i
                    for n in range(endIndexReplace-1, 0, -1):
                        if not isSpeceficPunctuation(text[n], ignorePunctuation):
                            endIndexReplace = n + 1
                            break
                    break
        else:
            endIndexReplace = len(text)
        #print(f' replaceSubstring, endIndexReplace={{{endIndexReplace}}}')
        changedText = text[0:startIndexReplace] + substituteString + text[endIndexReplace:len(text)]
        return changedText, text[startIndexReplace:endIndexReplace]
    return text, ""

def getAddress(text):
    overflow = 0
    df_LOC = getLOCTag(text)
    addressFound = False
    if not df_LOC.empty:
        startIndex = df_LOC["start"].iloc[-1]
        endIndex = df_LOC["end"].iloc[-1]
        address = text[startIndex:endIndex]
        #If the range determined by the tagger corresponds to a string 
        #that is only delimited by punctuation before and after, then it is most likely a publisher.
        #startIndex - 2 because of a space inbetween
        print(f' getAddress, len(text)={{{len(text)}}}')
        print(f' getAddress, endIndex={{{endIndex}}}')
        print(f' getAddress, text={{{text}}}')
        if endIndex < len(text) -1:
            if isSpeceficPunctuation(text[startIndex - 2]) and isSpeceficPunctuation(text[endIndex + 1]):
                addressFound = True
        else:
            if isSpeceficPunctuation(text[startIndex - 2]):
                addressFound = True
        if len(df_LOC) > 1 and addressFound:
            startIndex2 = df_LOC["start"].iloc[0]
            endIndex2 = df_LOC["end"].iloc[0]
            address2 = text[startIndex2:endIndex2]
            if isSpeceficPunctuation(text[startIndex2 - 2]) and isSpeceficPunctuation(text[endIndex2:startIndex]):
                changedText, address = replaceSubstring(startIndex2, endIndex, text, "")
                return changedText, custom_strip(address)
        if addressFound:
            changedText, address = replaceSubstring(startIndex, endIndex, text, "")
            return changedText, custom_strip(address)
    return text, ""

def getDate(text):
    monthYearRegex = "\b(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?" \
    "|May|May\.?|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept\.?|October|" \
    "Oct\.?|November|Nov\.?|December|Dec\.?)\s\d{4}\b"
    changedText, monthYear  = getSubstringByRegEx(text, [monthYearRegex])
    if monthYear == "":
        yearRegEx1 = "(\(\d{4}\)|\. \d{4}\.)"
        yearRegEx2 = "(\.|,) \d{4}(\.|,|;)"
        changedText, year  = getSubstringByRegEx(text, [yearRegEx1, yearRegEx2])
        return changedText, "", f'year={{{year}}}'
    monthYear = monthYear.split(' ')
    return changedText, f'month={{{monthYear[0]}}}', f'year={{{monthYear[1]}}}'
    
def getTitel(text):
    text = custom_strip(text)
    limit = len(text) - 1
    i = 0
    #remove pairs of punctuation marks 
    while i < limit:
        if isSpeceficPunctuation(text[i]) and isSpeceficPunctuation(text[i+1]):
            text = text[:i] + "." + text[i+2:]
            i = i - 1
            limit = limit - 1
        i = i +1
    if text.count(".") == 1:
        text = text.split(".")
    elif text.count(",") == 1: 
        text = text.split(",")
    elif text.count(".") == 2:
        text = text.split(".")
        return custom_strip(text[0]), custom_strip(text[1]), custom_strip(text[2])
    elif text.count(",") > 1:
        text = text.rsplit(',', 1)
    return custom_strip(text[0]), custom_strip(text[1]), ""

def getPersonTags(text):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty: 
        return df_outputs[df_outputs["entity_group"] == "PER"].reset_index(drop=True)
    return pd.DataFrame()

def getORGTag(text, score):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty:
        return df_outputs[(df_outputs["entity_group"] == "ORG") & (df_outputs["score"] >= score)].reset_index(drop=True).tail(1)
    return pd.DataFrame()

def getLOCTag(text):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty:
        return df_outputs[(df_outputs["entity_group"] == "LOC")].reset_index(drop=True).tail(2)
    return pd.DataFrame()

def getDoi(text):
    doiUrlRegEx1 = "https:\/\/doi\.org(\/[^\s]*)?$"
    doiUrlRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org)?([^\s]*)+$"
    text, doi  = getSubstringByRegEx(text, [doiUrlRegEx1, doiUrlRegEx2])
    httpsDomainRegEx1 = "https:\/\/doi\.org\/"
    httpsDomainRegEx2 = "(DOI|doi):\s?(https:\/\/doi\.org\/)?"
    doi, httpsDomain = getSubstringByRegEx(doi, [httpsDomainRegEx1, httpsDomainRegEx2])
    return text, custom_strip(doi)

def getSubstringByRegEx(text, regex = []):
    startIndex, endIndex, substring = getIndexOfSubstring(text, regex, True)
    changedText, substring = replaceSubstring(startIndex, endIndex, text, "")
    return changedText, custom_strip(substring)

#search_terms = [", et al.", " et al."]
#firstStartIndex, firstEndIndex, etAl = find_First_Term(text, search_terms)
#if firstStartIndex > -1:
    #text = replaceSubstring(firstStartIndex, firstEndIndex, text, ", ")

def create_bibtex(text):
    author = ""
    editor = ""
    title = ""
    booktitle = ""
    journal = ""
    series = ""
    year = ""
    volume = ""
    number = ""
    edition = ""
    pages = ""
    month = ""
    publisher = ""
    school = ""
    address = ""
    note = ""
    annote = ""
    doi = ""
    url = ""
    book = False
    article = False
    proceedings = False
    inproceedings = False
    incollection = False
    phdThesis = False
    
    urlRegEx = "https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?"
    pageRegEx = "(?:pp\.? )?\d+(-|–)\d+"
    volumeRegEx = "(V|v)ol\. \d+"
    number1RegEx = "no\. \d+"
    number2RegEx = "Issue \d+"
    number3RegEx = "\d+"
    edition1 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) ed\."
    edition2 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) edn\."
    
    text, author = getAuthors(text)
    text, editor = getEditors(text)
    text, doi = getDoi(text)
    text, url = getSubstringByRegEx(text, [urlRegEx])
    text, month, year = getDate(text)
    text, page = getSubstringByRegEx(text, [pageRegEx])
    text, volume = getSubstringByRegEx(text, [volumeRegEx])
    text, number = getSubstringByRegEx(text, [number1RegEx, number2RegEx, number3RegEx])
    text, edition = getSubstringByRegEx(text, [edition1, edition2])
    #BUGFIX: Wenn nur num vorkommt, dann schneidet volume die Zahl von num aus!!!!!!
    #volume3 darf also erst geprüft werden, wenn num1 und num2 geprüft wurden.
    #VOlumer erscheint aber immre vor number
    text, address = getAddress(text)
    text, publisher = getPublisher(text, doi)
    titel, booktitel, series = getTitel(text)

    return f'authors: {author}' + ", \r\n" + f'editors: {editor}' \
+ ", \r\n" + f'doi: {doi}' +  ", \r\n"  +  f'{year}' +  ", \r\n"  + f'number : {number}' \
+  ", \r\n" + f'volume : {volume}' +  ", \r\n"  + f'edition: {edition}' +  ", \r\n"  + f'page: {page}' \
+  ", \r\n"  + f'url: {url}' +  ", \r\n"  + f'publisher: {publisher}' +  ", \r\n"  + f'address: {address}' \
+  ", \r\n" + f'titel: {titel}' +  ", \r\n"  + f'booktitel: {booktitel}' 



In [57]:
text="Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence. In J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948"
text = "M. A. Nielsen and I. L. Chuang, “Quantum Computation and Quantum Information,” in Handbook of Quantum Information Science, vol. 4, C. H. Bennett and D. P. DiVincenzo, Eds. Berlin, Germany: Springer, 2026, pp. 250–300. doi: 10.1007/springerreference-303198."
#text = "Nielsen, M. A.; Chuang, I. L. Quantum Computation and Quantum Information. In Handbook of Quantum Information Science; Bennett, C. H., DiVincenzo, D. P., Eds.; Quantum Science and Technology; Springer: Berlin, Germany, 2026; Vol. 4, pp 250–300. https://doi.org/10.1007/springerreference-303198."
print(create_bibtex(text))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted

function processNames, surenameFirst: True
Fall surenameFirst


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


text after replace editors : , “Quantum Computation and Quantum Information,” in Handbook of Quantum Information Science, vol. 4, C. H. Bennett and D. P. DiVincenzo, Eds. Berlin, Germany: Springer, 2026, pp. 250–300. doi: 10.1007/springerreference-303198.
function processNames, surenameFirst: True
Fall surenameFirst


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 getAddress, len(text)={166}
 getAddress, endIndex={151}
 getAddress, text={, “Quantum Computation and Quantum Information,” in Handbook of Quantum Information Science, , C. H. Bennett and D. P. DiVincenzo, Eds. Berlin, Germany: Springer, ,  }


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


authors: M. A. Nielsen and I. L. Chuang, 
editors: C. H. Bennett and D. P. DiVincenzo, 
doi: 10.1007/springerreference-303198, 
year={2026}, 
number : , 
volume : vol. 4, 
edition: , 
page: pp. 250–300, 
url: , 
publisher: Springer, 
address: Berlin, Germany, 
titel: Quantum Computation and Quantum Information, 
booktitel: in Handbook of Quantum Information Science.C.H.Bennett and D.P.DiVincenzo.Eds


Idee: Nun zunächst Jahr, Volume, Seiten, Edition und URL/DOI extrahieren. Den Rest (also Titel, Publisher, Series), dann nochmal den SpacyParer drüber laufen lassen, weil 

In [16]:
s = "This Book was edited by Martin Trump" 
print(getIndexOfSubstring(s, ["edited by"]))
ner_tagger = pipeline("ner", aggregation_strategy="simple")
text = s
outputs = ner_tagger(text)
df_outputs = pd.DataFrame(outputs)
print(df_outputs)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


(14, 22, 'edited by')


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  entity_group     score          word  start  end
0         MISC  0.871492          Book      5    9
1          PER  0.999499  Martin Trump     24   36


In [13]:
#Test, um Datum Zuverlässig erkannt wird

text = "Geoffrey Hinton, Yoshua Bengio, and Yann LeCun. 2021. Deep Learning for Artificial Intelligence. In Proceedings of the IEEE International Conference on Neural Networks (Advances in Neural Information Processing), IEEE University Press, Montreal, Canada, 100–120. DOI:https://doi.org/10.1109/ICNN.2021.9483948"
text = "dsfsdf London"
print(len(text))
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

ner_tagger = pipeline("ner", aggregation_strategy="simple")
outputs = ner_tagger(text)
df_outputs = pd.DataFrame(outputs)
print(df_outputs)
pos_pipeline = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")
results = pos_pipeline(text)
for result in results:
    print(result)
    print("---------------------------------------------------------------------------")
    #print(f"Word: {result['word']}, POS Tag: {result['entity']}")

13


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


dsfsdf dsfsdf NOUN NN compound xxxx True False
London London PROPN NNP ROOT Xxxxx True False
London 7 13 GPE


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  entity_group     score    word  start  end
0          LOC  0.926873  London      7   13


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'I-LOC', 'score': 0.92687315, 'index': 6, 'word': 'London', 'start': 7, 'end': 13}
---------------------------------------------------------------------------


In [2]:
x = "hallo"
print(f'function processNames, {{ {x} }} ')

function processNames, { hallo } 


In [20]:
doi = "10.1109/SeFeT55524.2022.9908774"

import requests

def get_publisher_from_doi(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        publisher = data['message'].get('publisher', 'Publisher not found')
        return publisher
    else:
        return 'DOI not found or invalid'

# Beispiel-DOI
publisher = get_publisher_from_doi(doi)
print(f"The publisher for DOI {doi} is: {publisher}")


The publisher for DOI 10.1109/SeFeT55524.2022.9908774 is: IEEE


In [21]:
print("ldsf ffdsf df, kkkk, sdfsdf, in: nbaldd, #NUM#, London"[53])

n


In [1]:
import string

def strip_punctuation(text):
    # Erstelle eine Übersetzungstabelle, die alle Interpunktionszeichen entfernt
    translator = str.maketrans('', '', string.punctuation)
    # Wende die Übersetzungstabelle auf den Text an
    return text.translate(translator)

# Beispielverwendung
text = "Hallo, wie geht's dir? Hoffentlich gut!"
stripped_text = strip_punctuation(text)
print(stripped_text)

Hallo wie gehts dir Hoffentlich gut


In [2]:
print("Hallo"[1])

a
