In [11]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AdamW, get_scheduler
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments, Trainer, AutoConfig

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


True


In [13]:
# Extrahiert nur den Titel
import pandas as pd 
import os
import string
import fnmatch
import re
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"

def getIndexOfSubstring(text, regEx = "", reverse = False, start = 0):
    #if reverse = False then it finds the first occurance of a given regEx.
    #if reverse = True, then it finds the last occurance of a given regEx.
    #beceause the occurance with the max length is taken, it always chooses the regex that covers the most letters
    length = 0
    matches = []
    substring = ""
    text = text[start:len(text)]
    match = re.search(regEx, text)
    if match:
        startIndex = match.start()
        endIndex = match.end()
        substring = text[match.start():match.end()]
        return startIndex + start, endIndex + start, substring 
    return -1, -1, substring

def replaceSubstring (startIndex, endIndex, text, substituteString, ignorePunctuation = ["&", "(", ")"]):
    changedText = text[0:startIndex] + substituteString + text[endIndex:len(text)]
    return changedText, text[startIndex:endIndex]
    return text, ""


def custom_strip(text, replaceCharacter = []):
    allowed_chars = string.punctuation + string.whitespace
    return text.strip(allowed_chars)

def extract_book_title(text):
    startIndex1, endIndex1, buffer = getIndexOfSubstring(text, '(booktitle = \s*|journal = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(text, ',\n', start = endIndex1)
    changedText, substring1 = replaceSubstring(endIndex1, startIndex2, text, '')
    substring1 = substring1.replace('\n', '')
    startIndex1, endIndex1, buffer = getIndexOfSubstring(changedText, '(title = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(changedText, ',\n', start = endIndex1)
    changedText, substring2 = replaceSubstring(endIndex1, startIndex2, changedText, '')
    substring2 = substring2.replace('\n', '')
    #startIndex1, endIndex1, buffer = getIndexOfSubstring(changedText, '(series = \s*)')
    #startIndex2, endIndex2, buffer = getIndexOfSubstring(changedText, ',\n', start = endIndex1)
    #changedText, substring2 = replaceSubstring(endIndex1, startIndex2, changedText, '')
    return custom_strip(substring2) + '.' + custom_strip(substring1)

style = ['acm', 'apa', 'mla', 'ieee', 'harv']
typ = ['book', 'article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
label = -1
filenames = []
rowParts = []
for style in style:
    f = os.path.join(directory, "training_" + style + ".csv")
    dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
    dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
    dfAll['BibTeX'] = dfAll['BibTeX'].apply(extract_book_title)
    for item in typ:
        label = label + 1
        dfAll.loc[dfAll['label'] == item, 'label'] = label
#bufferAll = bufferAll.sample(random_state=1)
    filenames.append(directory + style + '_labeled.csv')
    dfAll.to_csv(directory + style + '_labeled.csv', columns=['label', 'BibTeX'], sep='|', index=False)
    label = -1

combined_df = pd.DataFrame()
offset = 700
for i, f in enumerate(filenames):
    df = pd.read_csv(f, sep='|', encoding='utf-8')
    rowParts.append(df.iloc[i*100:i*100+101])
    rowParts.append(df.iloc[offset+i*100:offset+i*100+101])
    rowParts.append(df.iloc[2*offset+i*100:2*offset+i*100+101])
    rowParts.append(df.iloc[3*offset+i*100:3*offset+i*100+101])
    rowParts.append(df.iloc[4*offset+i*100:4*offset+i*100+101])
    rowParts.append(df.iloc[5*offset+i*100:5*offset+i*100+101])
combined_df = pd.concat(rowParts, ignore_index=True)
combined_df.to_csv(directory + 'traindataAll_labeled.csv', sep='|', index=False)

 

In [14]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds, average="weighted")
    rec = recall_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1, "recall": rec, "precession": pre}


In [15]:

f = directory + 'traindataAll_labeled.csv'
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
data = {
    'label': [],
    'text': []
}

data['label'] = dfAll['label'].tolist()
print(data['label'])
data['text'] = dfAll['BibTeX'].tolist()




dfData = pd.DataFrame(data)
dataset = Dataset.from_pandas(dfData)

split_dataset = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

#Die Funktion map wendet die preprocess_function auf jedes Element des Datasets an. 
#Deine preprocess_function führt die Tokenisierung der text-Spalte durch. 
#Da die map-Funktion das resultierende Objekt der Funktion (return der preprocess_function) zu den bestehenden Daten hinzufügt, 
#bleiben die ursprünglichen Spalten (label und text) erhalten. 
#Zusätzlich werden neue Spalten für die tokenisierten Daten hinzugefügt, wie z.B. input_ids und attention_mask.

tokenized_train_data = train_dataset.map(preprocess_function, batched=True)
tokenized_val_data = val_dataset.map(preprocess_function, batched=True)

print(tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
id2label = {0: 'book', 1: 'article', 2: 'proceedings', 3: 'inproceedings', 4: 'incollection', 5: 'phdthesis'}
label2id = {'book': 0, 'article': 1, 'proceedings': 2, 'inproceedings': 3, 'incollection': 4, 'phdthesis': 5}

config = AutoConfig.from_pretrained(model_ckpt, num_labels=6, id2label=id2label, label2id=label2id)
print(config)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

training_args = TrainingArguments(
    output_dir= "LiteratureTyp_recognizer",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 

Map:   0%|          | 0/2424 [00:00<?, ? examples/s]

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}




DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "book",
    "1": "article",
    "2": "proceedings",
    "3": "inproceedings",
    "4": "incollection",
    "5": "phdthesis"
  },
  "initializer_range": 0.02,
  "label2id": {
    "article": 1,
    "book": 0,
    "incollection": 4,
    "inproceedings": 3,
    "phdthesis": 5,
    "proceedings": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.0",
  "vocab_size": 30522
}



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.067733,0.991749,0.991789,0.991749,0.992135
2,No log,0.046565,0.990099,0.99009,0.990099,0.990138
3,No log,0.034218,0.993399,0.993399,0.993399,0.993399
4,0.203300,0.038746,0.990099,0.99009,0.990099,0.990138
5,0.203300,0.033535,0.993399,0.993399,0.993399,0.993399


TrainOutput(global_step=760, training_loss=0.1356656041584517, metrics={'train_runtime': 1409.004, 'train_samples_per_second': 8.602, 'train_steps_per_second': 0.539, 'total_flos': 1605619399311360.0, 'train_loss': 0.1356656041584517, 'epoch': 5.0})

In [16]:
from transformers import pipeline
text="""The Role of Balanced Training and Testing Data Sets for Binary Classifiers in Bioinformatics. PLOS ONE."""
classifier = pipeline("text-classification", model="LaLaf93/LiteratureTyp_recognizer")
print(classifier(text))

config.json:   0%|          | 0.00/952 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

[{'label': 'incollection', 'score': 0.9718014001846313}]
