In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AdamW, get_scheduler
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments, Trainer, AutoConfig

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


True


In [5]:
import pandas as pd 
import os
import string
import fnmatch
import re
directory = "..."

'''
Erzeugt einen Trainingsdatensatz, der aus Booktitel und Titel besteht.
'''

def getIndexOfSubstring(text, regEx = "", start = 0):
   
    '''
    Prüft für einen RegEx, ob sie in text vorkommen. 
    
    Parameter:
    text: Text, wo das Auftreten des RegEx geprüft wird.
    regEx = RegEx. 
    start: Index, ab dem in text gesucht werden soll.
    
    return: Substring und den zugehörigen Start- und Endindex.
    '''
    
    length = 0
    matches = []
    substring = ""
    text = text[start:len(text)]
    match = re.search(regEx, text)
    if match:
        startIndex = match.start()
        endIndex = match.end()
        substring = text[match.start():match.end()]
        return startIndex + start, endIndex + start, substring 
    return -1, -1, substring

def replaceSubstring (startIndex, endIndex, text, substituteString):
       
    '''
    Ersetzt in dem String namens text einen Substring durch einen anderen String namens substituteString.
    
    Parameter:
    startIndex: Index, wo der zu ersetztende Substring im String namens text eingefügt werden soll.
    endIndex: Index, wo der zu ersetztende Substring im String text enden soll.
    text: Text, wo das Auftreten des Substrings geprüft wird.
    substituteString: Der einzufügende Substring.

    
    return: Substring und den zugehörigen Start- und Endindex.
    '''
    
    changedText = text[0:startIndex] + substituteString + text[endIndex:len(text)]
    return changedText, text[startIndex:endIndex]
    return text, ""

def custom_strip(text):
        
    '''
    Strip-Funktion, die standardmäßig neben Whitespace auch zeichen aus string.punctuation entfernt.
    
    Parameter:
    text: Text, der gestripped werden soll.
    
    return: gestrippter Text.
    '''
    
    allowed_chars = string.punctuation + string.whitespace
    return text.strip(allowed_chars)

def extract_book_title(text):
            
    '''
    Extrhiert Booktitel und den Titel.
    
    Parameter:
    text: Text, aus dem Booktitel und den Titel extrahiert werden soll.
    
    return: Booktitel und den Titel.
    '''
    
    startIndex1, endIndex1, buffer = getIndexOfSubstring(text, '(booktitle = \s*|journal = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(text, ',\n', start = endIndex1)
    changedText, substring1 = replaceSubstring(endIndex1, startIndex2, text, '')
    substring1 = substring1.replace('\n', '')
    startIndex1, endIndex1, buffer = getIndexOfSubstring(changedText, '(title = \s*)')
    startIndex2, endIndex2, buffer = getIndexOfSubstring(changedText, ',\n', start = endIndex1)
    changedText, substring2 = replaceSubstring(endIndex1, startIndex2, changedText, '')
    substring2 = substring2.replace('\n', '')
    return custom_strip(substring2) + '.' + custom_strip(substring1)

style = ['acm', 'apa', 'mla', 'ieee', 'harv']
typ = ['book', 'article', 'proceedings', 'inproceedings', 'incollection', 'phdthesis']
label = -1
filenames = []
rowParts = []
for style in style:
    f = os.path.join(directory, "training_" + style + ".csv")
    dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
    dfAll = dfAll.rename(columns={"Literaturtyp": "label"})
    dfAll['BibTeX'] = dfAll['BibTeX'].apply(extract_book_title)
    for item in typ:
        label = label + 1
        dfAll.loc[dfAll['label'] == item, 'label'] = label
    filenames.append(directory + style + '_labeled.csv')
    dfAll.to_csv(directory + style + '_labeled.csv', columns=['label', 'BibTeX'], sep='|', index=False)
    label = -1

combined_df = pd.DataFrame()
offset = 700
for i, f in enumerate(filenames):
    df = pd.read_csv(f, sep='|', encoding='utf-8')
    rowParts.append(df.iloc[i*100:i*100+101])
    rowParts.append(df.iloc[offset+i*100:offset+i*100+101])
    rowParts.append(df.iloc[2*offset+i*100:2*offset+i*100+101])
    rowParts.append(df.iloc[3*offset+i*100:3*offset+i*100+101])
    rowParts.append(df.iloc[4*offset+i*100:4*offset+i*100+101])
    rowParts.append(df.iloc[5*offset+i*100:5*offset+i*100+101])
combined_df = pd.concat(rowParts, ignore_index=True)
combined_df.to_csv(directory + 'traindataAll_labeled.csv', sep='|', index=False)

 

In [6]:
def preprocess_function(examples):
    
    '''
    Führt eine Tokenizierung der Trainings- und Validierungsdaten durch
    
    Parameter:
    examples: Dataset mit Strings
    
    return: Dictionary mit input_ids und attention_mask
    '''
    
    print(tokenizer(examples["text"], padding="max_length", truncation=True))   
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def compute_metrics(pred):
    
    '''
    Berechnet die Metriken Accuracy, Precession, Recall und f1-Score.
    
    Parameter:
    pred: Vorehrsage des Modells.
    
    return: Dictionary mit Accuracy, Precession, Recall und f1-Score.
    '''
    
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds, average="weighted")
    rec = recall_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1, "recall": rec, "precession": pre}


In [7]:
f = directory + 'traindataAll_labeled.csv'
dfAll = pd.read_csv(f, sep='|', encoding='utf-8') 
data = {
    'label': [],
    'text': []
}

data['label'] = dfAll['label'].tolist()
data['text'] = dfAll['BibTeX'].tolist()

dfData = pd.DataFrame(data)
dataset = Dataset.from_pandas(dfData)

split_dataset = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

tokenized_train_data = train_dataset.map(preprocess_function, batched=True)
tokenized_val_data = val_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
id2label = {0: 'book', 1: 'article', 2: 'proceedings', 3: 'inproceedings', 4: 'incollection', 5: 'phdthesis'}
label2id = {'book': 0, 'article': 1, 'proceedings': 2, 'inproceedings': 3, 'incollection': 4, 'phdthesis': 5}

config = AutoConfig.from_pretrained(model_ckpt, num_labels=6, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

training_args = TrainingArguments(
    output_dir= "LiteratureTyp_recognizer",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 

Map:   0%|          | 0/2424 [00:00<?, ? examples/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'input_ids': [[101, 1996, 10867, 7716, 18279, 4328, 2243, 1999, 4315, 18178, 9856, 6151, 6887, 7274, 5480, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}




DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "book",
    "1": "article",
    "2": "proceedings",
    "3": "inproceedings",
    "4": "incollection",
    "5": "phdthesis"
  },
  "initializer_range": 0.02,
  "label2id": {
    "article": 1,
    "book": 0,
    "incollection": 4,
    "inproceedings": 3,
    "phdthesis": 5,
    "proceedings": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.0",
  "vocab_size": 30522
}



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
