In [1]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, load_metric
from torch.utils.data import DataLoader
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np

# Working directory that holds the annotation file and the saved model and
# tokenizer folders, which later cells reference by relative path.
# Set the MASTERARBEIT_DIR environment variable to run the notebook on another
# machine; the fallback is the original author's local folder.
PROJECT_DIR = os.environ.get("MASTERARBEIT_DIR", "c:\\Users\\Kirchner\\Desktop\\Masterarbeit")
os.chdir(PROJECT_DIR)

In [2]:

import gc

# Run a garbage-collection pass before touching the CUDA cache.
gc.collect()

# Ask PyTorch to release the GPU memory it is holding in its cache.
torch.cuda.empty_cache()

In [3]:
# Number of leading rows of the annotation file that form the training split.
train_size = 50


def read_annotations(txt_name, mode):
    """Read texts and one label column from a tab-separated annotation file.

    Args:
        txt_name: Path of the UTF-8 encoded, tab-separated annotation file.
        mode: ``"opinion"`` takes the label from column 5, ``"sentiment"``
            from column 6 (zero-based). For any other value a message is
            printed and no labels are collected.

    Returns:
        A ``(texts, labels)`` tuple of lists. ``texts`` holds column 8 of
        every non-blank line; ``labels`` holds the matching label strings and
        is empty when ``mode`` is not recognised.

    Raises:
        IndexError: If a non-blank line has fewer than nine columns.
    """
    label_columns = {"opinion": 5, "sentiment": 6}
    label_column = label_columns.get(mode)
    if label_column is None:
        # Report the problem once rather than once per line of the file.
        print(f"There is no label for {mode}")

    texts = []
    labels = []

    # Plain read mode: the file is only read, so write access is not required.
    with open(txt_name, encoding="utf-8", mode="r") as f:
        for line in f:
            # Drop the line terminator so it cannot end up inside the last column.
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. an empty last line) carry no record.
                continue
            columns = line.split("\t")
            texts.append(columns[8])
            if label_column is not None:
                labels.append(columns[label_column])

    return texts, labels

In [4]:
# Load every annotated text together with its opinion label.
meine_texte, meine_labels = read_annotations("cleaned_annotated_data_training.txt", "opinion")

# The first `train_size` rows form the training split, the remainder the test split.
texts_train, texts_test = meine_texte[:train_size], meine_texte[train_size:]
labels_train, labels_test = meine_labels[:train_size], meine_labels[train_size:]

print(len(labels_train), len(labels_test))

50 37


In [5]:
# Tokenizer: fall back to the published checkpoint unless a locally saved copy exists.
if not os.path.isdir("meine_tokenizer"):
    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")
else:
    print("Loading finetuned tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("meine_tokenizer")

# Model: same rule, keyed on the folder the training cell saves into.
if not os.path.isdir("Sentiment_Bert_mit_spiegel"):
    model = AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert")
else:
    print("Loading finetuned model...")
    model = AutoModelForSequenceClassification.from_pretrained("Sentiment_Bert_mit_spiegel")

Loading finetuned tokenizer...
Loading finetuned model...


In [6]:
# Collator that pads each batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits up front, truncating and padding within each split.
train_text_tokenized = tokenizer(texts_train, padding=True, truncation=True)
test_text_tokenized = tokenizer(texts_test, padding=True, truncation=True)

In [7]:
class SpiegelDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tab-separated Spiegel annotation file.

    The bare name ``Dataset`` in this notebook is imported from ``datasets``,
    not from ``torch.utils.data``. This class only implements the
    ``__len__`` / ``__getitem__`` protocol, so it derives from the PyTorch
    base class explicitly.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, tsv_file):
        """Load all ``(label, text)`` pairs from ``tsv_file``.

        Args:
            tsv_file: Path of the UTF-8 encoded, tab-separated file. Column 5
                is used as the label and column 8 as the text (zero-based).

        Raises:
            IndexError: If a non-blank line has fewer than nine columns.
        """
        # List of (label, text) tuples, one per non-blank line of the file.
        self.sample = []

        # Plain read mode: the file is only read, so write access is not required.
        with open(tsv_file, encoding="utf-8", mode="r") as f:
            for line in f:
                # Drop the line terminator so it cannot end up inside the last column.
                line = line.rstrip("\n")
                if not line.strip():
                    # Blank lines carry no record.
                    continue
                columns = line.split("\t")
                self.sample.append((columns[5], columns[8]))

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.sample)

    def __getitem__(self, idx):
        """Return record ``idx`` as ``{"text": ..., "label": ...}``."""
        label, text = self.sample[idx]
        return {"text": text, "label": label}
    

In [8]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping whose ``"text"`` entry holds the raw texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts with
        truncation and padding both enabled.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, padding=True)

In [25]:
# spiegel = SpiegelDataset("cleaned_annotated_data_training.txt")

# spiegel_data = load_dataset("text", data_files={"train": "Spiegel_train.txt", "test" : "Spiegel_test.txt"})

Using custom data configuration default-b87b2cbca73061b8
Reusing dataset text (C:\Users\Kirchner\.cache\huggingface\datasets\text\default-b87b2cbca73061b8\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
100%|██████████| 2/2 [00:00<00:00, 795.81it/s]


In [9]:
# Integer class id assigned to each opinion label found in the annotation file.
# NOTE(review): confirm this order agrees with `model.config.id2label`; the
# checkpoint's own label mapping is not visible in this notebook.
label_to_id = {"negative" : 0, "positive" : 1, "neutral" : 2}

# Reuse the shared annotation parser instead of repeating the file loop here.
text, opinion_labels = read_annotations("cleaned_annotated_data_training.txt", "opinion")

# Raises KeyError if the file contains a label that is missing from `label_to_id`.
labels = [label_to_id[opinion_label] for opinion_label in opinion_labels]

# Column layout consumed by `Dataset.from_dict` further down.
train_dict = {"text": text, "labels" : labels}

In [10]:

# Two hand-written evaluation sentences with their integer class labels.
test_dict = {
    "text": [
        "Was eine Sauerei diese Maßnahmen!",
        "Ich finde das OK, ist aber nichts Neues.",
    ],
    "labels": [0, 2],
}

# Accuracy metric object loaded through the `datasets` metric loader.
metric = load_metric("accuracy")

def compute_metrics_accuracy(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    The accuracy is computed directly with NumPy, so the function does not
    depend on a separately loaded metric object.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            arg-max of ``logits`` over its last axis; ``labels`` holds the
            reference class ids.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions equal to their label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Kept from the original cell: shows the raw predictions next to the labels.
    print("Predictions", predictions, "Labels", labels)
    # Element-wise comparison needs an array even when labels arrive as a list.
    references = np.asarray(labels)
    return {"accuracy": float(np.mean(predictions == references))}


# Training split: wrap the column dict in a Dataset, then tokenize it in batches.
meine_dataset_train = Dataset.from_dict(train_dict)
tokenized_dataset_train = meine_dataset_train.map(preprocess_function, batched=True)

# Evaluation split: same two steps for the hand-written test sentences.
mein_dataset_test = Dataset.from_dict(test_dict)
tokenized_dataset_test = mein_dataset_test.map(preprocess_function, batched=True)

100%|██████████| 1/1 [00:00<00:00,  8.67ba/s]
100%|██████████| 1/1 [00:00<00:00, 504.55ba/s]


In [114]:
# tokenized_spiegel = spiegel_data.map(preprocess_function, batched=True)
# Show the first tokenized evaluation example to check the columns added by the tokenizer.
tokenized_dataset_test[0]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
 'input_ids': [3, 2019, 155, 9878, 10, 620, 3406, 26982, 4, 0, 0, 0, 0],
 'labels': 0,
 'text': 'Was eine Sauerei diese Maßnahmen!',
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [12]:
# Hyper-parameters for the run; `output_dir` is where the Trainer writes its results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=20,
)

# Wire model, data, padding collator and the accuracy callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    compute_metrics=compute_metrics_accuracy,
)

# This cell only evaluates the loaded model; training is triggered in a later cell.
trainer.evaluate()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 1
  0%|          | 0/2 [00:00<?, ?it/s]

Predictions [1 1] Labels [0 2]


100%|██████████| 2/2 [00:01<00:00,  1.99it/s]


{'eval_loss': 7.7954559326171875,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.283,
 'eval_samples_per_second': 7.067,
 'eval_steps_per_second': 7.067}

In [None]:
# Fine-tune the model, then store the weights and the tokenizer in the folders
# that the loading cell checks for before falling back to the published checkpoint.
trainer.train()
model.save_pretrained("Sentiment_Bert_mit_spiegel")
tokenizer.save_pretrained("meine_tokenizer")

In [96]:
# Number of attention masks in the tokenized training split (one per input text).
len(train_text_tokenized["attention_mask"])

50

In [143]:


# training_args = TrainingArguments(output_dir="./results",
#                     learning_rate=2e-5,
#                     per_device_eval_batch_size=4,
#                     per_device_train_batch_size=4,
#                     num_train_epochs=3,
#                     weight_decay=0.01
#                     )

# trainer = Trainer(
#             model=model,
#             args=training_args,
#             train_dataset=train_text_tokenized,
#             eval_dataset=test_text_tokenized,
#             tokenizer = tokenizer,
#             data_collator= data_collator
# )


# trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3



[A[A[A

AttributeError: 'list' object has no attribute 'keys'

In [None]:
# Display the complete tokenizer output for the training texts.
train_text_tokenized

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch with truncation only.

    NOTE(review): a function with this name is also defined earlier in the
    notebook (that one additionally passes ``padding=True``); running this
    cell replaces it.

    Args:
        examples: Mapping whose ``"text"`` entry holds the raw texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True)

# NOTE(review): `imdb` is not defined anywhere in the visible notebook, so this
# line raises NameError unless it is defined elsewhere — confirm whether this
# cell is leftover code.
tokenized_imdb = imdb.map(preprocess_function, batched=True)