In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset


In [2]:
dfBibTex = pd.read_csv('bibTex.csv', encoding = "ansi", on_bad_lines='skip', sep=';')

In [3]:
data = {
    'input_text': [],
    'target_text': []
}

data['input_text'] = dfBibTex['Referenzstring'].tolist()
data['target_text'] = dfBibTex['BibTeX'].tolist()

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)


split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

                                            input_text  \
671  @inproceedings{ahsan-etal-2024-multimodal,\n  ...   
286  @inproceedings{shah-etal-2024-parrottts,\n    ...   
728  @inproceedings{k-etal-2024-dataset,\n    title...   
434  @inproceedings{fily-etal-2024-establishing,\n ...   
734  @inproceedings{b-etal-2024-findings-shared,\n ...   
..                                                 ...   
536  @inproceedings{koto-etal-2024-zero,\n    title...   
346  @inproceedings{chen-etal-2024-learning,\n    t...   
247  @inproceedings{ohman-etal-2024-emotionarcs,\n ...   
350  @inproceedings{lin-etal-2024-indivec,\n    tit...   
60   @inproceedings{miranda-2024-allen,\n    title ...   

                                           target_text  
671  Shawly Ahsan, Eftekhar Hossain, Omar Sharif, A...  
286  Neil Shah, Saiteja Kosgi, Vishal Tambrahalli, ...  
728  Devika K, Hariprasath .s.b, Haripriya B, Vigne...  
434  Maxime Fily, Guillaume Wisniewski, Severine Gu...  
734  Premjith B, J

In [4]:
# Tokenisierung
tokenizer = T5Tokenizer.from_pretrained('t5-small')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def preprocess_function(examples):
    inputs = examples['input_text']
    inputs = ['translate bibliographyEntry to BibTeX:' + doc for doc in examples['input_text']]
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [15]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = val_dataset.map(preprocess_function, batched=True)





Map:   0%|          | 0/616 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

In [16]:
print(tokenized_train_dataset[1])

{'input_text': 'Nikolaos Aletras and Orphee De Clercq, editors. Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, St. Julians, Malta, March 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.eacl-demo.0.\n', 'target_text': '@proceedings{eacl-2024-european,\n    title = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations",\n    editor = "Aletras, Nikolaos  and\n      De Clercq, Orphee",\n    month = mar,\n    year = "2024",\n    address = "St. Julians, Malta",\n    publisher = "Association for Computational Linguistics",\n    url = "https://aclanthology.org/2024.eacl-demo.0",\n}\n', 'input_ids': [13959, 24765, 5984, 16924, 651, 12, 3, 27915, 382, 15, 4, 10, 567, 12027, 521, 32, 7, 15345, 1313, 7, 11, 955, 102, 88, 15, 374, 4779, 49, 75, 1824, 6, 18008, 5, 24569, 13, 8, 507, 189, 4379, 13, 8, 161

In [17]:
import torch
print(torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=True,
)



trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


True


In [18]:
trainer.train()

# Testen
test_input = """Brown, B. (2021). New findings. Important Journal, 10(2), 200-220."""
inputs = tokenizer.encode("translate bibliographyEntry to BibTeX:" + test_input, return_tensors="pt", max_length=512, truncation=True).to(device)
outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Epoch,Training Loss,Validation Loss
1,No log,1.641047
2,No log,1.138298
3,No log,0.888591
4,No log,0.722874
5,No log,0.605539
6,No log,0.531891
7,No log,0.484598
8,No log,0.4532
9,No log,0.436056
10,No log,0.429324


BibliographyEntry to BibTeX:Brown, B. (2021), New findings, Important Journal, 10(2), 200-220.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select) -> Dieser Fehler tritt auf, wenn Tensoren auf verschiedenen Geräten (z. B. CPU und GPU) platziert sind und eine Operation ausgeführt wird, die Tensoren erwartet, die alle auf dem gleichen Gerät sind. Um diesen Fehler zu beheben, müssen wir sicherstellen, dass sowohl das Modell als auch die Tensoren (inputs) auf demselben Gerät (CPU oder GPU) sind.

In [25]:
test_input = """Brown, B. (2021). New findings. Important Journal, 10(2), 200-220."""
inputs = tokenizer.encode("translate bibliographyEntry to BibTeX:" + test_input, return_tensors="pt", max_length=512, truncation=True).to(device)
outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0]))

<pad> BibliographyEntry to BibTeX:Brown, B. (2021), New findings, Important Journal, 10(2), 200-220.</s>
