In [42]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AdamW, get_scheduler
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments, Trainer, AutoConfig

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [12]:
# importing panda library 
import pandas as pd 
import os
import fnmatch
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"
#|||BibTeX
typ = ['book', 'article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
style = ['acm', 'apa', 'mla', 'ieee', 'harv']
dfAll = pd.DataFrame(columns=['Referenzstring', 'Style', 'Literaturtyp', 'BibTeX'])
dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
dfStyleAll_complemantary = pd.DataFrame(columns=['Referenzstring', 'Style', 'Literaturtyp', 'BibTeX'])
dfBuffer = pd.DataFrame()

for styleElement in style:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if fnmatch.fnmatch(filename, '*' + styleElement + '.csv'):
            dfBuffer = pd.read_csv(f, sep='|', encoding='utf-8') 
            dfBuffer = dfBuffer.rename(columns={"Literaturtyp": "label"})
            dfAll = pd.concat([dfAll, dfBuffer])
dfAll.to_csv(directory + 'trainingsdaten_all.csv', sep='|', index=False) 
print("--------------------------------------")
f = os.path.join(directory, 'trainingsdaten_all.csv')
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
print(dfAll)
print("--------------------------------------")
for item in typ:
    bufferAll = dfAll.copy()
    complemantaryList = [n for n in typ if n != item]
    print(complemantaryList)
    for complemantaryItem in complemantaryList:
        bufferAll.loc[bufferAll['label'] == complemantaryItem, 'label'] = 0
    bufferAll.loc[bufferAll['label'] == item, 'label'] = 1
    bufferAll = bufferAll.sample(frac=0.1, random_state=1)
    bufferAll.to_csv(directory + 'trainingsdaten_all_' + item + '.csv', sep='|', index=False)
    #trainArticleModel(item, dfAll)
 


--------------------------------------
                                          Referenzstring    Style      label  \
0      Shamane Siriwardhana, Rivindu Weerasekera, Ell...      acm    article   
1      Bingzhi Li, Guillaume Wisniewski, and Benoît C...      acm    article   
2      Josef Valvoda, Ryan Cotterell, and Simone Teuf...      acm    article   
3      Tom Sherborne and Mirella Lapata. 2023. Meta-l...      acm    article   
4      Zhi Chen, Yuncong Liu, Lu Chen, Su Zhu, Mengyu...      acm    article   
...                                                  ...      ...        ...   
20990  Braysher, O. 2015. "Accuracy-aware optimizatio...  harvard  phdthesis   
20991  Breed, C. 2018. "Query enumeration and nowhere...  harvard  phdthesis   
20992  Breede, C. 2019. "Visual Odometry and Sparse S...  harvard  phdthesis   
20993  Breffitt, R. 2020. "Modeling Recurring Concept...  harvard  phdthesis   
20994  Brewin, P. 2016 (October). "Minimizing Overhea...  harvard  phdthesis   



In [19]:
# Extrahiert nur den Titel
import pandas as pd 
import os
import fnmatch
import re
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"

def extract_book_title(text):
    hit1 = ""
    hit2 = ""
    hit3 = ""
    match = re.search(r'title\s*=\s*["{]([^"}]*)["}],', text)
    if match:
        hit1 = match.group(1)
    else:
        print(text)
    match = re.search(r'(booktitle|journal)\s*=\s*["{]([^"}]*)["}],",', text)
    if match:
        hit2 = match.group(2)
    match = re.search(r'series\s*=\s*["{]([^"}]*)["}],",', text)
    if match:
        hit3 = match.group(1)
    return hit1 + "." + hit2


f = os.path.join(directory, "training_acm.csv")
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
dfAll['BibTeX'] = dfAll['BibTeX'].apply(extract_book_title)
dfAll.to_csv(directory + 'trainingsdaten_all.csv', columns=['label', 'BibTeX'], sep='|', index=False) 
print("--------------------------------------")
f = os.path.join(directory, 'trainingsdaten_all.csv')
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
print("--------------------------------------")
for item in typ:
    bufferAll = dfAll.copy()
    complemantaryList = [n for n in typ if n != item]
    print(complemantaryList)
    for complemantaryItem in complemantaryList:
        bufferAll.loc[bufferAll['label'] == complemantaryItem, 'label'] = 0
    bufferAll.loc[bufferAll['label'] == item, 'label'] = 1
    bufferAll = bufferAll.sample(frac=0.1, random_state=1)
    bufferAll.to_csv(directory + 'trainingsdaten_all_' + item + '.csv', sep='|', index=False)
    #trainArticleModel(item, dfAll)
 

@article{siriwardhana-etal-2023-improving,
    title = "Improving the Domain Adaptation of Retrieval Augmented Generation ({RAG}) Models for Open Domain Question Answering",
    author = "Siriwardhana, Shamane  and
      Weerasekera, Rivindu  and
      Wen, Elliott  and
      Kaluarachchi, Tharindu  and
      Rana, Rajib  and
      Nanayakkara, Suranga",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "11",
    year = "2023",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/2023.tacl-1.1",
    doi = "10.1162/tacl_a_00530",
    pages = "1--17",
}

@article{chen-etal-2023-opal,
    title = "{OPAL}: Ontology-Aware Pretrained Language Model for End-to-End Task-Oriented Dialogue",
    author = "Chen, Zhi  and
      Liu, Yuncong  and
      Chen, Lu  and
      Zhu, Su  and
      Wu, Mengyue  and
      Yu, Kai",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "

In [50]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds)
    rec = recall_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "recall": rec, "precession": pre}


In [51]:
for item in typ:
    f = directory + 'trainingsdaten_all_' + item + '.csv'
    dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
    data = {
        'label': [],
        'text': []
    }

    data['label'] = dfAll['label'].tolist()
    print(data['label'])
    data['text'] = dfAll['BibTeX'].tolist()
    



    dfData = pd.DataFrame(data)
    dataset = Dataset.from_pandas(dfData)

    split_dataset = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    #Die Funktion map wendet die preprocess_function auf jedes Element des Datasets an. 
    #Deine preprocess_function führt die Tokenisierung der text-Spalte durch. 
    #Da die map-Funktion das resultierende Objekt der Funktion (return der preprocess_function) zu den bestehenden Daten hinzufügt, 
    #bleiben die ursprünglichen Spalten (label und text) erhalten. 
    #Zusätzlich werden neue Spalten für die tokenisierten Daten hinzugefügt, wie z.B. input_ids und attention_mask.

    tokenized_train_data = train_dataset.map(preprocess_function, batched=True)
    tokenized_val_data = val_dataset.map(preprocess_function, batched=True)
    
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")
    
    id2label = {0: "NON" + item, 1: item}
    label2id = {"NON" + item: 0, item: 1}
    
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2, label2id=label2id, id2label=id2label)
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

    training_args = TrainingArguments(
        output_dir= item + "_recognizer",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=20,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_data,
        eval_dataset=tokenized_val_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()


[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.229508,0.814286,0.839992,1.0,0.430657
2,No log,0.227725,0.814286,0.839992,1.0,0.430657
3,No log,0.229768,0.814286,0.839992,1.0,0.430657
4,No log,0.227789,0.814286,0.839992,1.0,0.430657
5,0.224700,0.230326,0.814286,0.839992,1.0,0.430657
6,0.224700,0.229713,0.814286,0.839992,1.0,0.430657
7,0.224700,0.232529,0.814286,0.839992,1.0,0.430657
8,0.224700,0.23111,0.814286,0.839992,1.0,0.430657
9,0.224700,0.232665,0.814286,0.839992,1.0,0.430657
10,0.216300,0.231743,0.814286,0.839992,1.0,0.430657


[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.006545,0.997619,0.997612,0.985915,1.0
2,No log,0.000694,1.0,1.0,1.0,1.0
3,No log,0.000334,1.0,1.0,1.0,1.0
4,No log,0.000202,1.0,1.0,1.0,1.0
5,0.038000,0.000138,1.0,1.0,1.0,1.0
6,0.038000,0.000101,1.0,1.0,1.0,1.0
7,0.038000,7.7e-05,1.0,1.0,1.0,1.0
8,0.038000,6.2e-05,1.0,1.0,1.0,1.0
9,0.038000,5e-05,1.0,1.0,1.0,1.0
10,0.000100,4.1e-05,1.0,1.0,1.0,1.0


[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.001918,1.0,1.0,1.0,1.0
2,No log,0.000597,1.0,1.0,1.0,1.0
3,No log,0.000305,1.0,1.0,1.0,1.0
4,No log,0.000189,1.0,1.0,1.0,1.0
5,0.023800,0.00013,1.0,1.0,1.0,1.0
6,0.023800,9.6e-05,1.0,1.0,1.0,1.0
7,0.023800,7.5e-05,1.0,1.0,1.0,1.0
8,0.023800,6e-05,1.0,1.0,1.0,1.0
9,0.023800,5e-05,1.0,1.0,1.0,1.0
10,0.000100,4.2e-05,1.0,1.0,1.0,1.0


[0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.002366,1.0,1.0,1.0,1.0
2,No log,0.000699,1.0,1.0,1.0,1.0
3,No log,0.000329,1.0,1.0,1.0,1.0
4,No log,0.000197,1.0,1.0,1.0,1.0
5,0.028600,0.000135,1.0,1.0,1.0,1.0
6,0.028600,0.000103,1.0,1.0,1.0,1.0
7,0.028600,8.1e-05,1.0,1.0,1.0,1.0
8,0.028600,6.3e-05,1.0,1.0,1.0,1.0
9,0.028600,5.2e-05,1.0,1.0,1.0,1.0
10,0.001100,4.4e-05,1.0,1.0,1.0,1.0


[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.00257,1.0,1.0,1.0,1.0
2,No log,0.000737,1.0,1.0,1.0,1.0
3,No log,0.000377,1.0,1.0,1.0,1.0
4,No log,0.000238,1.0,1.0,1.0,1.0
5,0.026200,0.000166,1.0,1.0,1.0,1.0
6,0.026200,0.000122,1.0,1.0,1.0,1.0
7,0.026200,9.4e-05,1.0,1.0,1.0,1.0
8,0.026200,7.5e-05,1.0,1.0,1.0,1.0
9,0.026200,6.2e-05,1.0,1.0,1.0,1.0
10,0.000200,5.2e-05,1.0,1.0,1.0,1.0


[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.230531,0.814286,0.730934,0.0,0.0
2,No log,0.226579,0.814286,0.730934,0.0,0.0
3,No log,0.229382,0.814286,0.730934,0.0,0.0
4,No log,0.228879,0.814286,0.730934,0.0,0.0
5,0.224600,0.229233,0.814286,0.730934,0.0,0.0
6,0.224600,0.229089,0.814286,0.730934,0.0,0.0
7,0.224600,0.232098,0.814286,0.730934,0.0,0.0
8,0.224600,0.230687,0.814286,0.730934,0.0,0.0
9,0.224600,0.232365,0.814286,0.730934,0.0,0.0
10,0.215300,0.231238,0.814286,0.730934,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [1]:
from transformers import pipeline
text="""Hallo"""
classifier = pipeline("text-classification", model="LaLaf93/phdthesis_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/inproceedings_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/book_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/incollection_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/article_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/phdthesis_recognizer")
print(classifier(text))

config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

[{'label': 'NONphdthesis', 'score': 0.5505961775779724}]
[{'label': 'NONinproceedings', 'score': 0.9999709129333496}]
[{'label': 'book', 'score': 0.5221682786941528}]
[{'label': 'NONincollection', 'score': 0.9992231130599976}]
[{'label': 'NONarticle', 'score': 0.9999327659606934}]
[{'label': 'NONphdthesis', 'score': 0.5505961775779724}]


In [7]:
# Beispielwerte für die Felder
author = "Lars"
title = "Meine Dissertation"
school = "Beispieluniversität"
year = "2024"
address = "Beispielstadt"
month = "Juli"
note = "Dies ist eine Anmerkung"
key = "mein_key"
type_field = "PhD"  # 'type' umbenannt in 'type_field' wegen reserviertem Wort in Python
doi = "10.1234/beispiel.doi"
url = "http://beispielurl.de"

# Gegebene Felder
bookFields = ["author", "title", "publisher", "year", "volume", "number", 
              "series", "address", "edition", "month", "note", "key", "editor", 
              "howpublished", "organization", "chapter", "pages", "isbn", "url"]

inproceedingsFields = ["author", "title", "booktitle", "year", "editor", "volume", 
                       "number", "series", "pages", "address", "month", "organization", 
                       "publisher", "note", "key", "doi", "url"]

proceedingsFields = ["title", "year", "editor", "volume", "number", "series", 
                     "address", "month", "organization", "publisher", "note", "key", "doi", "url"]

incollectionFields = ["author", "title", "booktitle", "publisher", "year", "editor", 
                      "volume", "number", "series", "type_field", "chapter", "pages", "address", 
                      "edition", "month", "note", "key", "doi", "url"]

articleFields = ["author", "title", "journal", "year", "volume", "number", 
                 "pages", "month", "note", "key", "doi", "url"]

phdthesisFields = ["author", "title", "school", "year", "address", "month", 
                   "note", "key", "type_field", "doi", "url"]

# Beispiel Literaturtyp
literatureType = "phdthesis"

# BibTeX Initialisierung
bibTex = "@"

if literatureType == "book":
    bibTex += "book{,\n"
    fields = bookFields
elif literatureType == "proceedings":
    bibTex += "proceedings{,\n"
    fields = proceedingsFields
elif literatureType == "inproceedings":
    bibTex += "inproceedings{,\n"
    fields = inproceedingsFields
elif literatureType == "incollection":
    bibTex += "incollection{,\n"
    fields = incollectionFields
elif literatureType == "article":
    bibTex += "article{,\n"
    fields = articleFields
else:
    bibTex += "phdthesis{,\n"
    fields = phdthesisFields

# Felder hinzufügen
for field in fields:
    value = eval(field) if field != 'type' else eval('type_field')
    bibTex += f'    {field}={{{value}}},\n'

# Schließe die BibTeX-Eintragung
bibTex = bibTex.rstrip(',\n') + "\n}"

# Ausgabe des Ergebnisses
print(bibTex)


@phdthesis{,
    author={Lars},
    title={Meine Dissertation},
    school={Beispieluniversität},
    year={2024},
    address={Beispielstadt},
    month={Juli},
    note={Dies ist eine Anmerkung},
    key={mein_key},
    type_field={PhD},
    doi={10.1234/beispiel.doi},
    url={http://beispielurl.de}
}


In [10]:
fields = ["author", "title", "school", "year", "address", "month", "note", "key", "type_field", "doi", "url"]
values = ["Lars", "Meine Dissertation", "Beispieluniversität", "2024", "Beispielstadt", "Juli", "Dies ist eine Anmerkung", "mein_key", "PhD", "10.1234/beispiel.doi", "http://beispielurl.de"]

zippedFields_values = zip(fields, values)
zippedList = list(zipped_fields_values)
bibTex += "book{,"
for field in bookFields:
    bibTex += f'{field[0]}={{{field[1]}}},\n' 

# Ausgabe der gezippten Liste
print(bibTex)


@phdthesis{,
    author={Lars},
    title={Meine Dissertation},
    school={Beispieluniversität},
    year={2024},
    address={Beispielstadt},
    month={Juli},
    note={Dies ist eine Anmerkung},
    key={mein_key},
    type_field={PhD},
    doi={10.1234/beispiel.doi},
    url={http://beispielurl.de}
}book{,a={u},
t={i},
p={u},
y={e},
v={o},
n={u},
s={e},
a={d},
e={d},
m={o},
n={o},
k={e},
e={d},
h={o},
o={r},
c={h},
p={a},
i={s},
u={r},
book{,a={u},
t={i},
p={u},
y={e},
v={o},
n={u},
s={e},
a={d},
e={d},
m={o},
n={o},
k={e},
e={d},
h={o},
o={r},
c={h},
p={a},
i={s},
u={r},

