In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AdamW, get_scheduler
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments, Trainer, AutoConfig

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [12]:
# importing panda library 
import pandas as pd 
import os
import fnmatch
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"
#|||BibTeX
typ = ['book', 'article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
style = ['acm', 'apa', 'mla', 'ieee', 'harv']
dfAll = pd.DataFrame(columns=['Referenzstring', 'Style', 'Literaturtyp', 'BibTeX'])
dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
dfStyleAll_complemantary = pd.DataFrame(columns=['Referenzstring', 'Style', 'Literaturtyp', 'BibTeX'])
dfBuffer = pd.DataFrame()

for styleElement in style:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if fnmatch.fnmatch(filename, '*' + styleElement + '.csv'):
            dfBuffer = pd.read_csv(f, sep='|', encoding='utf-8') 
            dfBuffer = dfBuffer.rename(columns={"Literaturtyp": "label"})
            dfAll = pd.concat([dfAll, dfBuffer])
dfAll.to_csv(directory + 'trainingsdaten_all.csv', sep='|', index=False) 
print("--------------------------------------")
f = os.path.join(directory, 'trainingsdaten_all.csv')
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
print(dfAll)
print("--------------------------------------")
for item in typ:
    bufferAll = dfAll.copy()
    complemantaryList = [n for n in typ if n != item]
    print(complemantaryList)
    for complemantaryItem in complemantaryList:
        bufferAll.loc[bufferAll['label'] == complemantaryItem, 'label'] = 0
    bufferAll.loc[bufferAll['label'] == item, 'label'] = 1
    bufferAll = bufferAll.sample(frac=0.1, random_state=1)
    bufferAll.to_csv(directory + 'trainingsdaten_all_' + item + '.csv', sep='|', index=False)
    #trainArticleModel(item, dfAll)
 


--------------------------------------
                                          Referenzstring    Style      label  \
0      Shamane Siriwardhana, Rivindu Weerasekera, Ell...      acm    article   
1      Bingzhi Li, Guillaume Wisniewski, and Benoît C...      acm    article   
2      Josef Valvoda, Ryan Cotterell, and Simone Teuf...      acm    article   
3      Tom Sherborne and Mirella Lapata. 2023. Meta-l...      acm    article   
4      Zhi Chen, Yuncong Liu, Lu Chen, Su Zhu, Mengyu...      acm    article   
...                                                  ...      ...        ...   
20990  Braysher, O. 2015. "Accuracy-aware optimizatio...  harvard  phdthesis   
20991  Breed, C. 2018. "Query enumeration and nowhere...  harvard  phdthesis   
20992  Breede, C. 2019. "Visual Odometry and Sparse S...  harvard  phdthesis   
20993  Breffitt, R. 2020. "Modeling Recurring Concept...  harvard  phdthesis   
20994  Brewin, P. 2016 (October). "Minimizing Overhea...  harvard  phdthesis   



In [37]:
# Extrahiert nur den Titel
import pandas as pd 
import os
import string
import fnmatch
import re
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"

def getIndexOfSubstring(text, regEx = "", reverse = False, start = 0):
    #if reverse = False then it finds the first occurance of a given regEx.
    #if reverse = True, then it finds the last occurance of a given regEx.
    #beceause the occurance with the max length is taken, it always chooses the regex that covers the most letters
    length = 0
    matches = []
    substring = ""
    text = text[start:len(text)]
    match = re.search(regEx, text)
    if match:
        startIndex = match.start()
        endIndex = match.end()
        substring = text[match.start():match.end()]
        return startIndex + start, endIndex + start, substring 
    return -1, -1, substring

def replaceSubstring (startIndex, endIndex, text, substituteString, ignorePunctuation = ["&", "(", ")"]):
    changedText = text[0:startIndex] + substituteString + text[endIndex:len(text)]
    return changedText, text[startIndex:endIndex]
    return text, ""


def custom_strip(text, replaceCharacter = []):
    allowed_chars = string.punctuation + string.whitespace
    return text.strip(allowed_chars)

def extract_book_title(text):
    startIndex1, endIndex1, buffer = getIndexOfSubstring(text, '(booktitle = \s*|journal = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(text, ',\n', start = endIndex1)
    changedText, substring1 = replaceSubstring(endIndex1, startIndex2, text, '')
    substring1 = substring1.replace('\n', '')
    startIndex1, endIndex1, buffer = getIndexOfSubstring(changedText, '(title = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(changedText, ',\n', start = endIndex1)
    changedText, substring2 = replaceSubstring(endIndex1, startIndex2, changedText, '')
    substring2 = substring2.replace('\n', '')
    #startIndex1, endIndex1, buffer = getIndexOfSubstring(changedText, '(series = \s*)')
    #startIndex2, endIndex2, buffer = getIndexOfSubstring(changedText, ',\n', start = endIndex1)
    #changedText, substring2 = replaceSubstring(endIndex1, startIndex2, changedText, '')
    return custom_strip(substring2) + '.' + custom_strip(substring1)

typ = ['book', 'article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
f = os.path.join(directory, "training_acm.csv")
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
dfAll['BibTeX'] = dfAll['BibTeX'].apply(extract_book_title)
dfAll.to_csv(directory + 'trainingsdaten_all.csv', columns=['label', 'BibTeX'], sep='|', index=False) 
print("--------------------------------------")
f = os.path.join(directory, 'trainingsdaten_all.csv')
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
print("--------------------------------------")
for item in typ:
    bufferAll = dfAll.copy()
    complemantaryList = [n for n in typ if n != item]
    print(complemantaryList)
    for complemantaryItem in complemantaryList:
        bufferAll.loc[bufferAll['label'] == complemantaryItem, 'label'] = 0
    bufferAll.loc[bufferAll['label'] == item, 'label'] = 1
    #bufferAll = bufferAll.sample(frac=0.5, random_state=1)
    bufferAll.to_csv(directory + 'trainingsdaten_all_' + item + '.csv', sep='|', index=False)
    #trainArticleModel(item, dfAll)
 

--------------------------------------
--------------------------------------
['article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
['book', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
['book', 'article', 'inproceedings', 'incollection', 'phdthesis']
['book', 'article', 'proceedings', 'incollection', 'phdthesis']
['book', 'article', 'proceedings', 'inproceedings', 'phdthesis']
['book', 'article', 'proceedings', 'inproceedings', 'incollection']


In [32]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds)
    rec = recall_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "recall": rec, "precession": pre}


In [33]:
for item in typ:
    f = directory + 'trainingsdaten_all_' + item + '.csv'
    dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
    data = {
        'label': [],
        'text': []
    }

    data['label'] = dfAll['label'].tolist()
    print(data['label'])
    data['text'] = dfAll['BibTeX'].tolist()
    



    dfData = pd.DataFrame(data)
    dataset = Dataset.from_pandas(dfData)

    split_dataset = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    #Die Funktion map wendet die preprocess_function auf jedes Element des Datasets an. 
    #Deine preprocess_function führt die Tokenisierung der text-Spalte durch. 
    #Da die map-Funktion das resultierende Objekt der Funktion (return der preprocess_function) zu den bestehenden Daten hinzufügt, 
    #bleiben die ursprünglichen Spalten (label und text) erhalten. 
    #Zusätzlich werden neue Spalten für die tokenisierten Daten hinzugefügt, wie z.B. input_ids und attention_mask.

    tokenized_train_data = train_dataset.map(preprocess_function, batched=True)
    tokenized_val_data = val_dataset.map(preprocess_function, batched=True)
    
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")
    
    id2label = {0: "NON" + item, 1: item}
    label2id = {"NON" + item: 0, item: 1}
    
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2, label2id=label2id, id2label=id2label)
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

    training_args = TrainingArguments(
        output_dir= item + "_recognizer",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=20,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_data,
        eval_dataset=tokenized_val_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.029219,0.994048,0.994022,0.972028,0.992857
2,No log,0.028374,0.994048,0.994022,0.972028,0.992857
3,0.053800,0.028304,0.994048,0.994022,0.972028,0.992857
4,0.053800,0.021536,0.995238,0.995211,0.972028,1.0
5,0.004000,0.024506,0.995238,0.995264,1.0,0.972789
6,0.004000,0.038655,0.992857,0.992915,1.0,0.959732
7,0.004000,0.027427,0.995238,0.995264,1.0,0.972789
8,0.001300,0.023284,0.995238,0.995264,1.0,0.972789
9,0.001300,0.020509,0.995238,0.995264,1.0,0.972789
10,0.000100,0.01562,0.995238,0.995264,1.0,0.972789


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.012165,0.997619,0.997626,1.0,0.985915
2,No log,0.008713,0.99881,0.998811,1.0,0.992908
3,0.035100,0.006353,0.99881,0.998811,1.0,0.992908
4,0.035100,8.7e-05,1.0,1.0,1.0,1.0
5,0.000700,0.000696,1.0,1.0,1.0,1.0
6,0.000700,0.000973,0.99881,0.998811,1.0,0.992908
7,0.000700,0.00045,1.0,1.0,1.0,1.0
8,0.000000,0.000285,1.0,1.0,1.0,1.0
9,0.000000,0.000225,1.0,1.0,1.0,1.0
10,0.000000,0.00017,1.0,1.0,1.0,1.0


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.002096,0.99881,0.998808,0.992647,1.0
2,No log,0.003477,0.99881,0.998811,1.0,0.992701
3,0.028500,0.006169,0.99881,0.998808,0.992647,1.0
4,0.028500,0.004851,0.99881,0.998808,0.992647,1.0
5,0.000200,0.003922,0.99881,0.998808,0.992647,1.0
6,0.000200,0.003477,0.99881,0.998808,0.992647,1.0
7,0.000200,0.003113,0.99881,0.998808,0.992647,1.0
8,0.000100,0.002863,0.99881,0.998808,0.992647,1.0
9,0.000100,0.002508,0.99881,0.998808,0.992647,1.0
10,0.000000,0.002341,0.99881,0.998808,0.992647,1.0


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.007277,0.99881,0.998811,1.0,0.992647
2,No log,0.008249,0.99881,0.998811,1.0,0.992647
3,0.032000,0.009005,0.99881,0.998811,1.0,0.992647
4,0.032000,0.0096,0.99881,0.998811,1.0,0.992647
5,0.000100,0.010074,0.99881,0.998811,1.0,0.992647
6,0.000100,0.010446,0.99881,0.998811,1.0,0.992647
7,0.000100,0.010774,0.99881,0.998811,1.0,0.992647
8,0.000000,0.011024,0.99881,0.998811,1.0,0.992647
9,0.000000,0.011272,0.99881,0.998811,1.0,0.992647
10,0.000000,0.011493,0.99881,0.998811,1.0,0.992647


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.00994,0.997619,0.997612,0.985714,1.0
2,No log,0.020026,0.997619,0.997612,0.985714,1.0
3,0.031700,0.015433,0.997619,0.997612,0.985714,1.0
4,0.031700,0.015909,0.997619,0.997612,0.985714,1.0
5,0.000500,0.016417,0.997619,0.997612,0.985714,1.0
6,0.000500,0.017598,0.997619,0.997612,0.985714,1.0
7,0.000500,0.01797,0.997619,0.997612,0.985714,1.0
8,0.000000,0.018513,0.997619,0.997612,0.985714,1.0
9,0.000000,0.018821,0.997619,0.997612,0.985714,1.0
10,0.000000,0.019148,0.997619,0.997612,0.985714,1.0


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.048841,0.985714,0.98579,0.972603,0.946667
2,No log,0.028981,0.992857,0.992857,0.979452,0.979452
3,0.055200,0.038513,0.990476,0.990576,1.0,0.948052
4,0.055200,0.043064,0.989286,0.989271,0.965753,0.972414
5,0.006100,0.054255,0.989286,0.989411,1.0,0.941935
6,0.006100,0.021305,0.995238,0.995263,1.0,0.973333
7,0.006100,0.055029,0.988095,0.987927,0.931507,1.0
8,0.001100,0.035107,0.994048,0.994087,1.0,0.966887
9,0.001100,0.034544,0.994048,0.994087,1.0,0.966887
10,0.000100,0.03514,0.994048,0.994087,1.0,0.966887


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/7a/fe/7afef9c905a11147949b36948f84f76b03b61c0b87526ff80645928a749179dd/168e2fc05ea218221d992386f282b0cdf1f6b3ab8c66653595cf446aed86c6e9?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240705%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240705T151017Z&X-Amz-Expires=86400&X-Amz-Signature=7299d072985400e11a79f81bb8d509c8f2b87484498a9b1f9b934ca9c5f8ba4c&X-Amz-SignedHeaders=host&partNumber=13&uploadId=S.1A7lU4wVNJpgbomWCtPu70Sok_GiCF7WgLrfoF5CdHxjr9QWWbmEqL1e8ttCrHcOaFdsjoTjGDzkWUukAeIzRVwIqXHR8xAmDkosWfqfb6Bu75ODcop5a1nXdUPi1u&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2427)')))"), '(Request ID: cb7fbf44-6cf7-4043-8eb1-0d26ddb6d1c1)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/7a/fe/

In [36]:
from transformers import pipeline
text="""An Intelligent Network Video Chat System Based on VNN Platform.2020 International Conference on Computer Network"""
classifier = pipeline("text-classification", model="LaLaf93/phdthesis_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/inproceedings_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/book_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/incollection_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/article_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="LaLaf93/phdthesis_recognizer")
print(classifier(text))

[{'label': 'NONphdthesis', 'score': 0.9999929666519165}]
[{'label': 'NONinproceedings', 'score': 0.9999983310699463}]
[{'label': 'NONbook', 'score': 0.9999891519546509}]
[{'label': 'incollection', 'score': 0.9999759197235107}]
[{'label': 'NONarticle', 'score': 0.9999972581863403}]
[{'label': 'NONphdthesis', 'score': 0.9999929666519165}]
