@book{advanced_physics, author = {Stephen Hawking and Brian Greene and Maria Trump and Susan Soy}, title = {Advanced Concepts in Theoretical Physics}, publisher = {Cambridge University Press}, year = {2025}, edition = {3}, volume = {1}, series = {Advanced Studies in Physics}, address = {Cambridge, UK}, month = {May}, note = {A seminal work in the field of theoretical physics}, isbn = {978-0-521-76948-0}, doi = {10.1017/example.book.2025}, url = {https://www.cambridge.org/advanced_physics}, annote = {Widely cited in the physics community}, abstract = {This book explores cutting-edge theories and concepts in theoretical physics...}, keywords = {Theoretical Physics, Quantum Mechanics, String Theory}, language = {English}, price = {75.00}, size = {600 pages}, lccn = {2020934576}, mrnumber = {MR3070071} }

@article{quantum_entanglement,
  author        = {Albert Einstein and Boris Podolsky and Nathan Rosen},
  title         = {Can Quantum-Mechanical Description of Physical Reality Be Considered Complete?},
  journal       = {Physical Review},
  year          = {1935},
  volume        = {47},
  number        = {10},
  pages         = {777-780},
  month         = {May},
  note          = {EPR Paradox paper, fundamental for quantum mechanics},
  doi           = {10.1103/PhysRev.47.777},
  url           = {https://journals.aps.org/pr/abstract/10.1103/PhysRev.47.777},
  abstract      = {In this paper, the authors discuss the EPR paradox and challenge the completeness of quantum mechanics...},
  keywords      = {Quantum Mechanics, EPR Paradox, Physical Reality},
  language      = {English},
  publisher     = {American Physical Society}
}

@inproceedings{deep_learning,
  author        = {Geoffrey Hinton and Yoshua Bengio and Yann LeCun},
  title         = {Deep Learning for Artificial Intelligence},
  booktitle     = {Proceedings of the IEEE International Conference on Neural Networks},
  year          = {2021},
  editor        = {Jane Smith and John Doe},
  volume        = {1},
  number = {5},
  series        = {Advances in Neural Information Processing},
  pages         = {100-120},
  address       = {Montreal, Canada},
  month         = {December},
  organization  = {IEEE},
  publisher     = {IEEE Press},
  note          = {Keynote paper on recent advancements in deep learning},
  isbn          = {978-1-5386-4637-1},
  doi           = {10.1109/ICNN.2021.9483948},
  url           = {https://ieeexplore.ieee.org/document/9483948},
  annote        = {A seminal work on how deep learning transforms AI},
  abstract      = {This paper explores cutting-edge deep learning techniques and their impact on the development of artificial intelligence...},
  keywords      = {Deep Learning, Artificial Intelligence, Neural Networks},
  language      = {English}
}


@incollection{quantum_computation,
  author        = {Michael A. Nielsen and Isaac L. Chuang},
  title         = {Quantum Computation and Quantum Information},
  booktitle     = {Handbook of Quantum Information Science},
  publisher     = {Springer},
  year          = {2026},
  editor        = {Charles H. Bennett and David P. DiVincenzo},
  volume        = {4},
  series        = {Quantum Science and Technology},
  chapter       = {10},
  pages         = {250-300},
  address       = {Berlin, Germany},
  month         = {October},
  note          = {A comprehensive overview of the fundamentals of quantum computation},
  isbn          = {978-3-540-88702-7},
  doi           = {10.1007/springerreference_303198},
  url           = {https://www.springer.com/gp/book/9783540887027},
  annote        = {Essential reading for researchers entering the field of quantum information},
  abstract      = {This chapter delves into the principles of quantum computing, offering an accessible yet thorough introduction...},
  keywords      = {Quantum Computing, Quantum Information, Computational Models},
  language      = {English},
  price         = {45.00},
  size          = {50 pages}
}


In [50]:
import torch
import pandas as pd
torch.cuda.is_available()
from transformers import pipeline
import re
import string
import spacy


def getIndexOfSubstring(text, regEx = [], reverse = False):
    min_index = float('inf')
    end_index = 0
    matches = []
    substring = ""
    for regExElement in regEx:
        matches = list(re.finditer(regExElement, text))
        if matches:
            if reverse:
                match = matches[-1]
            else:
                match = matches[0]
            if match.start() < min_index:
                min_index = match.start()
                end_index = match.end()
                substring = text[match.start():match.end()]
    if matches:
        return match.start(), match.end(), substring   
    return -1, -1, substring

def is_SurenameFirst(names):
    splitedNames = names.split(" ")
    #regex wie w+ erkennt bspw. KEIN è 
    if splitedNames[0].endswith(".") or not is_punctuation(splitedNames[0][-1], []):
        return True
    return False
    
def is_NameShortened(df_PER):
    for index in df_PER.index.values.tolist():
        if "." == text[df_PER["end"].iloc[index]] and len(text[df_PER["start"].iloc[index]:df_PER["end"].iloc[index] + 1]) == 2:
            return True
    return False


def is_punctuation(text, replaceCharacter = []):
    allowed_chars = string.punctuation + ' '
    for character in replaceCharacter:
        allowed_chars = allowed_chars.replace(character, '')
    return all(char in allowed_chars for char in text)

def is_Editor(editorRegEx, textBetweenNames, index, text):
    if re.search(editorRegEx, textBetweenNames):
        x = re.search(editorRegEx, text)
        #print(f'x: {x.start()}')
        if is_punctuation(text[index:x.start()], ["&"]):
            return True
    return False

def processNames(authors):
    finalAuthors = ""
    search_terms = [" and ", ", and ", " & ", ", & "]
    andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
    surenameFirst = is_SurenameFirst(authors.strip())
    print(f'function processNames, surenameFirst: {surenameFirst}')
    if surenameFirst:
        print("Fall surenameFirst".format(authors))
        #hier völlig egal, ob er einzelne Initialen in ein eigenes Word gesteckt hat, obwohl es noch Nachnamen gib
        authors = authors.replace(andInAuthors, " and ")
        #print(f'authors: {authors}')
        finalAuthors = authors.replace(", ", " and ")
    elif andInAuthors != "":
        if "., " in authors:
            print("Fall ., {0}".format(authors))
            search_terms = ["., and ", "., & ", ". and ", ". & "]
            andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
            authors = authors.replace(andInAuthors, "., ")
            authors = authors.split("., ")
            authors = [name + "." for name in authors]
            authors = [name.replace("..",".") for name in authors]
            for author in authors[:-1]:
                buffer = author.split(", ")
                finalAuthors = finalAuthors + buffer[1] + " " + buffer[0] + " and "
            buffer = authors[-1].split(", ")
            finalAuthors = finalAuthors + buffer[1] + " " +  buffer[0]
        elif ", " in authors:
            print("Fall , {0}".format(authors))
            search_terms = [", and ", ", & ", " and ", " & "]
            andInAuthors = getIndexOfSubstring(authors, search_terms)[2]
            authors = authors.replace(andInAuthors, ", ")
            authors = authors.split(", ")
            for i in range(0, len(authors) - 3, 2):
                finalAuthors = finalAuthors + authors[i+1] + " " + authors[i] + " and "
            finalAuthors = finalAuthors + authors[len(authors) - 1] + " " + authors[len(authors) - 2]
    else:
        print("Fall else {0}".format(authors))
        authors = authors.split(", ")
        finalAuthros = authors[1] + authors[0]
    return finalAuthors

def getAuthors(df_PER, text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    index_df_PER_List = df_PER.index.values.tolist()
    onlyPunctuation = False
    onlyAnd = False
    authorsDetected = False
    setChainStart = True
    startIndexAuthors = -1
    endIndexAuthors = -1
    chainStartIndex = -1


    for index in index_df_PER_List:
        #beachte: Hiermit lese ich immer schon vor!
        if index < len(index_df_PER_List) - 1:
            textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
        else:
            textBetweenNames = text[df_PER["end"].iloc[index]:]
        onlyPunctuation = is_punctuation(textBetweenNames, ["&"])
        firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
        onlyAnd = textBetweenNames == andTyp
        if setChainStart: 
            chainStartIndex = df_PER["start"].iloc[index]
            setChainStart = False
        if not onlyPunctuation and not onlyAnd:
            setChainStart = True
            startIndexAuthors = chainStartIndex
            endIndexAuthors = df_PER["end"].iloc[index]
            break
    return startIndexAuthors,endIndexAuthors

def getEditors(df_PER, text):
    search_terms = [" and ", ", and ", " & ", ", & ", "., & ", "., and ", ". and ", ". & "]
    editorRegEx = " (\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?"
    index_df_PER_List = df_PER.index.values.tolist()
    onlyPunctuation = False
    onlyAnd = False
    setChainStart = True
    isEditor = False
    startIndexEditors = -1
    endIndexEditors = -1
    chainStartIndex = -1


    for index in index_df_PER_List:
        if index < len(index_df_PER_List) - 1:
            textBetweenNames = text[df_PER["end"].iloc[index]:df_PER["start"].iloc[index + 1]]
        else:
            textBetweenNames = text[df_PER["end"].iloc[index]:]
        onlyPunctuation = is_punctuation(textBetweenNames, ["&"])
        firstStartIndex, firstEndIndex, andTyp = getIndexOfSubstring(textBetweenNames, search_terms)
        onlyAnd = textBetweenNames == andTyp
        #wenn true, dann beginnt eine neue Autorenkette
        if setChainStart: 
            chainStartIndex = df_PER["start"].iloc[index]
            #Solange das auf False, sollen der Substring erweitert werden, also start bleibt konstant
            setChainStart = False
        #Dann ist die Autorenkette zu Ende
        if not onlyPunctuation and not onlyAnd:
            setChainStart = True
            #Es können auch nur Editoren und keine Autoren vorkommen
            isEditor = is_Editor(editorRegEx, textBetweenNames, df_PER["end"].iloc[index], text)
            if isEditor:
                startIndexEditors = chainStartIndex
                endIndexEditors = df_PER["end"].iloc[index]
                break
    #print(f'getAuthorsAndEditors: return: {[startIndexAuthors,endIndexAuthors],[startIndexEditors, endIndexEditors]}')
    return startIndexEditors, endIndexEditors

def replaceSubstring (startIndex, endIndex, text, substituteString, replacedPunctuation = ["&", "(", ")"]):
    if endIndex > 0:
        startIndexReplace = -1
        endIndexReplace = -1
        changedText = text
        if startIndex > 0:
            for i in range(startIndex - 1, 0, -1):
                if is_punctuation(text[i], replacedPunctuation):
                    startIndexReplace = i + 1
                    break
        else:
            startIndexReplace = 0
        print(f'startIndexReplace : {startIndexReplace}')
        if endIndex < len(text):
            for i in range(endIndex, len(text), 1):
                if is_punctuation(text[i], replacedPunctuation):
                    endIndexReplace = i
                    break
        else:
            endIndexReplace = len(text)
        print(f'endIndexReplace : {endIndexReplace}')
        changedText = text[0:startIndexReplace] + substituteString + text[endIndexReplace:len(text)]
        print(f'text after replaceSubstring : {text[startIndexReplace:endIndexReplace]}')
        return changedText, text[startIndexReplace:endIndexReplace]
    return text, ""


def getPersonTags(text):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    return df_outputs[df_outputs["entity_group"] == "PER"].reset_index(drop=True)

def getORGTag(text, score):
    ner_tagger = pipeline("ner", aggregation_strategy="simple")
    outputs = ner_tagger(text)
    df_outputs = pd.DataFrame(outputs)
    if not df_outputs.empty:
        return df_outputs[(df_outputs["entity_group"] == "ORG") & (df_outputs["score"] >= score)].reset_index(drop=True).tail(1)
    return df_outputs

#search_terms = [", et al.", " et al."]
#firstStartIndex, firstEndIndex, etAl = find_First_Term(text, search_terms)
#if firstStartIndex > -1:
    #text = replaceSubstring(firstStartIndex, firstEndIndex, text, ", ")

def create_bibtex(text):
    df_PER = getPersonTags(text)

    doiUrlRegEx = "https:\/\/doi\.org(\/[^\s]*)?$"
    doiUrlRegEx2 = "(DOI|doi):(https:\/\/doi\.org)?([^\s]*)+$"
    editorRegEx = "(\()?(Eds\.|Eds|Ed|ed|Ed\.|ed\.|eds\.|editor|editors)(\))?"
    year1 = "(\(\d{4}\)|\. \d{4}\.)"
    year2 = "(\.|,) \d{4}(\.|,)"


    finalAuthors = ""
    finalEditors = ""
    startIndexAuthors,endIndexAuthors = getAuthors(df_PER, text)
    if startIndexAuthors > -1:
        text, authors = replaceSubstring(startIndexAuthors, endIndexAuthors, text, "#AUTHOR#")
        print(f'text after replace authors : {text}')
        finalAuthors = processNames(authors)
    else:
        startIndexAuthors, endIndexAuthors = 0, 0
    
    df_PER = getPersonTags(text)
    startIndexEditors, endIndexEditors = getEditors(df_PER, text)
    if startIndexEditors > -1:
        text, editors = replaceSubstring(startIndexEditors, endIndexEditors, text, "#EDITOR#")
        print(f'text after replace editors : {text}')
        #es soll erst ab Editors gesucht werden, daher text[endIndexEditors:]. Sonst Verwechslungsgefahr
        print(f'text[endIndexEditors:] : {text[endIndexEditors:]}')
        endIndexEditors = getIndexOfSubstring(text, ["#EDITOR#"])[1]
        startIndexEditorMarker, endIndexEditorMarker, finalEditors = getIndexOfSubstring(text[endIndexEditors:], [editorRegEx])
        print(f'startIndexEditorMarker : {startIndexEditorMarker}')
        print(f'endIndexEditorMarker : {endIndexEditorMarker}')
        startIndexEditorMarker = startIndexEditorMarker + endIndexEditors
        endIndexEditorMarker = endIndexEditorMarker + endIndexEditors
        print(f'startIndexEditorMarker : {startIndexEditorMarker}')
        print(f'endIndexEditorMarker : {endIndexEditorMarker} \r\n')
        print(f'editors: {editors}')
        finalEditors = processNames(editors)
        print(f'finalEditors : {finalEditors} \r\n')
        text, replacedEditorMarker = replaceSubstring(startIndexEditorMarker, endIndexEditorMarker, text, "")
        print(f'text after replace EditorMarker : {text}')

    else:
        startIndexEditors, endIndexEditors = 0, 0

    print("")

    startIndex, endIndex, finalDoi = getIndexOfSubstring(text, [doiUrlRegEx, doiUrlRegEx2], True)
    text, finalDoi = replaceSubstring(startIndex, endIndex, text, "#DOI#")
    #print(f'text after replace DOI: {text}')

    urlRegEx = "https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?"
    startIndex, endIndex, finalURL = getIndexOfSubstring(text, [urlRegEx], True)
    text, finalURL = replaceSubstring(startIndex, endIndex, text, "#URL#")
    #print(f'text after replace DOI: {text}')

    startIndex, endIndex, finalYear = getIndexOfSubstring(text, [year1])
    if startIndex < 0:
        startIndex, endIndex, finalYear = getIndexOfSubstring(text, [year2], True)
    text, finalYear = replaceSubstring(startIndex, endIndex, text, "#YEAR#")
    finalYear = re.search(r'\d+', finalYear).group(0) if re.search(r'\d+', finalYear) else ""

    pageFinder = "(?:pp\.? )?\d+(-|–)\d+"

    startIndex, endIndex, finalPage = getIndexOfSubstring(text, [pageFinder], True)
    text, finalPage = replaceSubstring(startIndex, endIndex, text, "#PAGE#")
    finalPage = re.search(r'\d+(-|–)\d+', finalPage).group(0) if re.search(r'\d+', finalPage) else ""

    #print(f'text after replace Page: {text}')

    number1 = " no\. \d+"
    number2 = " Issue \d+"
    number3 = "\d+"

    #Volume, Seite, Number stehen IMMEr nach dem Titel. Also diese von Hinten suchen
    startIndex, endIndex, finalNumber = getIndexOfSubstring(text, [number1, number2, number3], True)
    text, finalNumber = replaceSubstring(startIndex, endIndex, text, "#NUM#")
    finalNumber = re.search(r'\d+', finalNumber).group(0) if re.search(r'\d+', finalNumber) else ""

    #print(f'text after replace Number: {text}')

    volume1 = "Vol\. \d+"
    volume2 = "vol\. \d+" 
    volume3 = "\d+"
    edition1 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) ed\."
    edition2 = "(?:[1-9]\d*th|11th|12th|13th|[1-9]\d*(?:st|nd|rd)) edn\."

    startIndex, endIndex, finalVolume = getIndexOfSubstring(text, [volume1, volume2, volume3], True)
    text, finalVolume = replaceSubstring(startIndex, endIndex, text, "#VOL#")
    finalVolume = re.search(r'\d+', finalVolume).group(0) if re.search(r'\d+', finalVolume) else ""

    #print(f'text after replace Volume: {text}')

    startIndex, endIndex, finalEdition = getIndexOfSubstring(text, [edition1, edition2], True)
    text, finalEdition = replaceSubstring(startIndex, endIndex, text, "#ED#")
    finalEdition = re.search(r'\d+', finalEdition).group(0) if re.search(r'\d+', finalEdition) else ""
    
    #Stadt und Land kommen immer nach dem Publisher
    finalTown = ""
    FinalCountry = ""

    #Nun die Delimiter der felder ermitteln. Ist ein Bereich dann nur Org, dann muss es publisher sein
    #print(f'text after replace Edition: {text}')
    text = text.replace('(', '.')
    text = text.replace(')', '.')
    text = re.sub(r'\.{2,}', '.', text)
    print(f'text after replace : {text}')
    textList = [element.strip() for element in text.split('#')]
    textList = [element for element in textList if element.strip()]
    print(textList)
    finalPublisher = ""
    possiblePublisher = textList[-1].strip()
    print(f'possiblePublisher: {possiblePublisher}')
    df_Org = getORGTag(possiblePublisher, 0.8)
    punctuation = string.punctuation.replace(' ', '')  # Entferne das Leerzeichen, falls es enthalten ist
    pattern = f"[{re.escape(punctuation)}]"
    possiblePublisher = re.sub(pattern, '', possiblePublisher)
    publisherDetected = False
    print(f'possiblePublisher: {possiblePublisher}')
    if not df_Org.empty:
        startIndex, endIndex = df_PER["start"].iloc[0], df_PER["end"].iloc[0]
        print(endIndex - startIndex == len(possiblePublisher))
        if endIndex - startIndex == len(possiblePublisher):
            finalPublisher = possiblePublisher[startIndex:endIndex]
            #double Check
            #nlp = spacy.load("en_core_web_sm")
            #doc = nlp(text)
            #for ent in doc.ents:
                #if ent.label_ == "ORG" or ent.label_ == "MISC":
                    #print(ent.end_char - ent.start_char == len(possiblePublisher))
                    #if ent.end_char - ent.start_char != len(possiblePublisher):
                        #startIndex = ent.start_char
                        #endIndex = ent.end_char                         
    if finalPublisher != "":
        textList = textList[:-1]

    print(f'text after replace Publisher : {text}')

    return f'authors: {finalAuthors}' + ", \r\n" + f'editors: {finalEditors}' \
+ ", \r\n" + f'doi: {finalDoi}' +  ", \r\n"  +  f'year: {finalYear}' +  ", \r\n"  + f'number : {finalNumber}' \
+  ", \r\n" + f'volume : {finalVolume}' +  ", \r\n"  + f'edition: {finalEdition}' +  ", \r\n"  + f'page: {finalPage}' \
+  ", \r\n"  + f'url: {finalURL}' +  ", \r\n"  + f'publisher: {finalPublisher}'



In [51]:
text="Hinton, G., Bengio, Y., & LeCun, Y. (2021). Deep Learning for Artificial Intelligence. In J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948"

print(create_bibtex(text))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted

startIndexReplace : 0
endIndexReplace : 34
text after replaceSubstring : Hinton, G., Bengio, Y., & LeCun, Y
text after replace authors : #AUTHOR#. (2021). Deep Learning for Artificial Intelligence. In J. Smith & J. Doe (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948
function processNames, surenameFirst: False
Fall ., Hinton, G., Bengio, Y., & LeCun, Y


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


startIndexReplace : 64
endIndexReplace : 81
text after replaceSubstring : J. Smith & J. Doe
text after replace editors : #AUTHOR#. (2021). Deep Learning for Artificial Intelligence. In #EDITOR# (Eds.), Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948
text[endIndexEditors:] : Proceedings of the IEEE International Conference on Neural Networks (Vol. 1, Issue 5, pp. 100–120). Physical Review. https://doi.org/10.1109/ICNN.2021.9483948
startIndexEditorMarker : 1
endIndexEditorMarker : 7
startIndexEditorMarker : 73
endIndexEditorMarker : 79 

editors: J. Smith & J. Doe
function processNames, surenameFirst: True
Fall surenameFirst
finalEditors : J. Smith and J. Doe 

startIndexReplace : 73
endIndexReplace : 79
text after replaceSubstring : (Eds.)
text after replace EditorMarker : #AUTHOR#. (2021). Deep Learning for Artificial Intelligence. In #EDITOR# , Proceedings of the IEEE Inter

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


possiblePublisher: URL
text after replace Publisher : #AUTHOR#. #YEAR#. Deep Learning for Artificial Intelligence. In #EDITOR# , Proceedings of the IEEE International Conference on Neural Networks .Vol. #VOL#, Issue #NUM#, #PAGE#. Physical Review. #URL#
authors: G. Hinton and Y. Bengio and Y. LeCun, 
editors: J. Smith and J. Doe, 
doi: , 
year: 2021, 
number : 5, 
volume : 1, 
edition: , 
page: 100–120, 
url: https://doi.org/10.1109/ICNN.2021.9483948, 
publisher: 


Idee: Nun zunächst Jahr, Volume, Seiten, Edition und URL/DOI extrahieren. Den Rest (also Titel, Publisher, Series), dann nochmal den SpacyParer drüber laufen lassen, weil 

In [16]:
s = "This Book was edited by Martin Trump" 
print(getIndexOfSubstring(s, ["edited by"]))
ner_tagger = pipeline("ner", aggregation_strategy="simple")
text = s
outputs = ner_tagger(text)
df_outputs = pd.DataFrame(outputs)
print(df_outputs)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


(14, 22, 'edited by')


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  entity_group     score          word  start  end
0         MISC  0.871492          Book      5    9
1          PER  0.999499  Martin Trump     24   36


In [40]:
#Test, um Datum Zuverlässig erkannt wird

text = "Geoffrey Hinton, Yoshua Bengio, and Yann LeCun. 2021. Deep Learning for Artificial Intelligence. In Proceedings of the IEEE International Conference on Neural Networks (Advances in Neural Information Processing), IEEE University Press, Montreal, Canada, 100–120. DOI:https://doi.org/10.1109/ICNN.2021.9483948"

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

ner_tagger = pipeline("ner", aggregation_strategy="simple")
outputs = ner_tagger(text)
df_outputs = pd.DataFrame(outputs)
print(df_outputs)
pos_pipeline = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")
results = pos_pipeline(text)
for result in results:
    print(result)
    print("---------------------------------------------------------------------------")
    #print(f"Word: {result['word']}, POS Tag: {result['entity']}")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Geoffrey Geoffrey PROPN NNP compound Xxxxx True False
Hinton Hinton PROPN NNP ROOT Xxxxx True False
, , PUNCT , punct , False False
Yoshua Yoshua PROPN NNP compound Xxxxx True False
Bengio Bengio PROPN NNP appos Xxxxx True False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
Yann Yann PROPN NNP compound Xxxx True False
LeCun LeCun PROPN NNP conj XxXxx True False
. . PUNCT . punct . False False
2021 2021 NUM CD ROOT dddd False False
. . PUNCT . punct . False False
Deep Deep PROPN NNP compound Xxxx True False
Learning Learning PROPN NNP ROOT Xxxxx True False
for for ADP IN prep xxx True True
Artificial Artificial PROPN NNP compound Xxxxx True False
Intelligence Intelligence PROPN NNP pobj Xxxxx True False
. . PUNCT . punct . False False
In in ADP IN ROOT Xx True True
Proceedings Proceedings PROPN NNP pobj Xxxxx True False
of of ADP IN prep xx True True
the the DET DT det xxx True True
IEEE IEEE PROPN NNP compound XXXX True False
International International PROPN NNP co

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


   entity_group     score                                              word  \
0           PER  0.999674                                   Geoffrey Hinton   
1           PER  0.999051                                     Yoshua Bengio   
2           PER  0.994860                                        Yann LeCun   
3          MISC  0.763999                                          Learning   
4          MISC  0.932916                           Artificial Intelligence   
5          MISC  0.961130  IEEE International Conference on Neural Networks   
6          MISC  0.945457         Advances in Neural Information Processing   
7           ORG  0.979995                             IEEE University Press   
8           LOC  0.991279                                          Montreal   
9           LOC  0.998504                                            Canada   
10          ORG  0.763287                                              ICNN   

    start  end  
0       0   15  
1      17   30  


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'I-PER', 'score': 0.9997296, 'index': 1, 'word': 'Geoffrey', 'start': 0, 'end': 8}
---------------------------------------------------------------------------
{'entity': 'I-PER', 'score': 0.9996716, 'index': 2, 'word': 'Hi', 'start': 9, 'end': 11}
---------------------------------------------------------------------------
{'entity': 'I-PER', 'score': 0.9996216, 'index': 3, 'word': '##nton', 'start': 11, 'end': 15}
---------------------------------------------------------------------------
{'entity': 'I-PER', 'score': 0.9996699, 'index': 5, 'word': 'Yo', 'start': 17, 'end': 19}
---------------------------------------------------------------------------
{'entity': 'I-PER', 'score': 0.9979983, 'index': 6, 'word': '##shu', 'start': 19, 'end': 22}
---------------------------------------------------------------------------
{'entity': 'I-PER', 'score': 0.9995653, 'index': 7, 'word': '##a', 'start': 22, 'end': 23}
---------------------------------------------------------------------

In [None]:
doiUrl2