In [1]:
import pandas as pd
import numpy as np
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          EvalPrediction, Trainer, TrainingArguments)
from tqdm.auto import tqdm
from nltk import sent_tokenize
from datasets import Dataset, DatasetDict

In [2]:
# Preprocessing
corpus = pd.read_csv("../data/rom_real_dataset_final.csv")
corpus = corpus[["author", "title", "epoch", "text", "pub_year_estim"]]
corpus["text"] = corpus.text.str.replace("[»›]", '"', regex=True)
corpus["text"] = corpus.text.str.replace("[«‹]", '"', regex=True)
corpus["text"] = corpus.text.str.replace("–", '-', regex=True)
corpus.text.str.contains("[»›]").any(), corpus.text.str.contains("[«‹]").any(), corpus.text.str.contains("–").any()

# Splitting
corpus_train = pd.concat([
    corpus.query("epoch == 'romantik'").sample(5, random_state=42),
    corpus.query("epoch == 'realismus'").sample(5, random_state=42)
])
corpus_test = corpus.drop(corpus_train.index)

In [3]:
model = AutoModelForSequenceClassification.from_pretrained("LennartKeller/longformer-gottbert-base-8192-aw512", num_labels=2, use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained("LennartKeller/longformer-gottbert-base-8192-aw512", use_auth_token=True)

Some weights of the model checkpoint at LennartKeller/longformer-gottbert-base-8192-aw512 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at LennartKeller/longformer-gottbert-base-8192-aw512 and are newly i

In [4]:
from sklearn.model_selection import train_test_split

def make_story_shuffled_dataset(corpus, tokenizer, perc=0.3, random_state=42):
    np.random.seed(random_state)
    data = []
    for index, row in corpus.iterrows():
        parts = []
        sents = sent_tokenize(row["text"])
        n_tokens = 0
        part_sents = []
        while index < len(sents):
            sent = sents[index]
            part_sents.append(sent)
            n_tokens += len(tokenizer(sent, add_special_tokens=False)["input_ids"])
            index += 1
            if n_tokens >= tokenizer.model_max_length:
                part_sents.pop(-1)
                parts.append(np.array(part_sents))
                index -= 1
                part_sents = []
                n_tokens = 0
        for part_index, part in enumerate(parts):
            orig_text = " ".join(part.tolist())
            # select per sent
            orig_index = np.arange(len(part))
            index_fixed, index_shuffle = train_test_split(orig_index, test_size=perc, random_state=random_state)
            sents_to_shuffle = part[index_shuffle]
            np.random.shuffle(sents_to_shuffle)
            part[index_shuffle] = sents_to_shuffle
            shuffled_text = " ".join(part)
            data.append({
                "part_index": part_index,
                "author": row["author"],
                "title": row["title"],
                "epoch": row["epoch"],
                "pub_year_estim": row["pub_year_estim"],
                "text": orig_text,
                "label": 0
            })
            data.append({
                "part_index": part_index,
                "author": row["author"],
                "title": row["title"],
                "epoch": row["epoch"],
                "pub_year_estim": row["pub_year_estim"],
                "text": shuffled_text,
                "label": 1
            })
    return pd.DataFrame.from_records(data)  

In [5]:
train_dataset = make_story_shuffled_dataset(corpus_train, tokenizer, perc=0.3)
test_dataset = make_story_shuffled_dataset(corpus_test, tokenizer, perc=0.3)

In [6]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [None]:


hf_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_dataset),
    "test": Dataset.from_pandas(test_dataset)
})
hf_dataset.save_to_disk("../data/rom_rea_story_shuffle_hf_dataset")


hf_dataset = hf_dataset.map(
    lambda entry: tokenizer(entry["text"], padding="max_length", truncation=True),
    batched=True,
    num_proc=12
)

hf_dataset = hf_dataset.rename_column("label", "labels")
hf_dataset.set_format("torch")

hf_train_dataset = hf_dataset["train"]
hf_train_dataset = hf_train_dataset.train_test_split(train_size=0.90)
hf_train_dataset

training_args = TrainingArguments(
    num_train_epochs=5,
    output_dir="sf_story",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=3e-6,
    logging_dir="sf_story/logs",
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=5,
    overwrite_output_dir=True,
    warmup_steps=0,
    fp16=True,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=hf_train_dataset["train"],
    eval_dataset=hf_train_dataset["test"],
    compute_metrics=compute_metrics
)

# Training
trainer.train()
model.save_pretrained("longformer-sf")
tokenizer.save_pretrained("longformer-sf")




Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: title, text, author, part_index, pub_year_estim, epoch.
***** Running training *****
  Num examples = 279
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1395
Initializing global attention on CLS token...


Step,Training Loss,Validation Loss,Accuracy
25,0.7217,0.699423,0.451613
50,0.6765,0.686177,0.548387
75,0.723,0.706622,0.451613
100,0.7112,0.685389,0.548387
125,0.7419,0.687547,0.548387
150,0.669,0.684901,0.548387
175,0.6909,0.696336,0.548387
200,0.6222,0.770366,0.548387
225,0.9945,0.720491,0.548387
250,0.7792,0.684429,0.548387


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

In [None]:
# Prediction
predictions = trainer.predict(hf_dataset["test"])
test_dataset = hf_dataset["test"].to_pandas()
test_dataset["pred"] = predictions.predictions.argmax(axis=1)
test_dataset["match"] = (test_dataset["labels"] == test_dataset["pred"]).astype("int")

In [None]:
test_dataset.to_csv("../data/testdataset_longformer_sf.csv", index=False)

In [11]:
test_dataset.groupby("epoch").match.mean()

epoch
realismus    0.931667
romantik     0.778502
Name: match, dtype: float64

In [12]:
### Test shuffling

In [None]:
def train(perc):
    train_dataset = make_story_shuffled_dataset(corpus_train, tokenizer, perc=perc)
    test_dataset = make_story_shuffled_dataset(corpus_test, tokenizer, perc=perc)
    hf_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_dataset),
    "test": Dataset.from_pandas(test_dataset)
    })
    hf_dataset.save_to_disk("../data/rom_rea_story_shuffle_hf_dataset")


    hf_dataset = hf_dataset.map(
        lambda entry: tokenizer(entry["text"], padding="max_length", truncation=True),
        batched=True,
        num_proc=12
    )

    hf_dataset = hf_dataset.rename_column("label", "labels")
    hf_dataset.set_format("torch")

    hf_train_dataset = hf_dataset["train"]
    hf_train_dataset = hf_train_dataset.train_test_split(train_size=0.90)
    hf_train_dataset

    training_args = TrainingArguments(
        num_train_epochs=5,
        output_dir="sf_story",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=3e-6,
        logging_dir="sf_story/logs",
        logging_strategy="steps",
        logging_steps=10,
        logging_first_step=True,
        evaluation_strategy="steps",
        eval_steps=25,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=5,
        overwrite_output_dir=True,
        warmup_steps=0,
        fp16=True,
        gradient_checkpointing=True
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=hf_train_dataset["train"],
        eval_dataset=hf_train_dataset["test"],
        compute_metrics=compute_metrics
    )
    # Prediction
    predictions = trainer.predict(hf_dataset["test"])
    test_dataset = hf_dataset["test"].to_pandas()
    test_dataset["pred"] = predictions.predictions.argmax(axis=1)
    test_dataset["match"] = (test_dataset["labels"] == test_dataset["pred"]).astype("int")
    test_dataset = test_dataset.drop(columns=["input_ids", "attention_mask"])
    test_dataset.to_csv(f"../data/testdataset_longformer_sf_{perc}.csv", index=False)
    return test_dataset["match"].copy()

In [13]:
test_dataset.sort_values"

Unnamed: 0,part_index,author,title,epoch,pub_year_estim,text,labels,input_ids,attention_mask,pred,match
0,0,"Alexis,-Willibald",Die Hosen des Herrn von Bredow,realismus,1846.0,1. Kapitel. Die Herbstwäsche\n \n \n \n \n \n ...,0,"[0, 159, 4, 4284, 4, 40, 1865, 8973, 51963, 75...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
1,0,"Alexis,-Willibald",Die Hosen des Herrn von Bredow,realismus,1846.0,1. Kapitel. Die Herbstwäsche\n \n \n \n \n \n ...,1,"[0, 159, 4, 4284, 4, 40, 1865, 8973, 51963, 75...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
2,1,"Alexis,-Willibald",Die Hosen des Herrn von Bredow,realismus,1846.0,"Nun ja, lieber Gott, wir haben kein Schloß Fri...",0,"[0, 3197, 170, 5, 1662, 1126, 5, 52, 62, 297, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
3,1,"Alexis,-Willibald",Die Hosen des Herrn von Bredow,realismus,1846.0,"Nun ja, lieber Gott, wir haben kein Schloß Fri...",1,"[0, 3197, 170, 5, 1662, 1126, 5, 52, 62, 297, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
4,2,"Alexis,-Willibald",Die Hosen des Herrn von Bredow,realismus,1846.0,Vor Schrecken war der Anne Susanne der Silberr...,0,"[0, 1218, 15542, 67, 8, 12052, 13753, 8, 3086,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
...,...,...,...,...,...,...,...,...,...,...,...
1209,5,"Waiblinger,-Wilhelm",Die Briten in Rom,romantik,1828.0,"Es stiegen wohl zuweilen Zweifel in ihm auf, w...",1,"[0, 392, 20610, 421, 27089, 4839, 10, 329, 18,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
1210,0,"Waiblinger,-Wilhelm",Das Märchen von der blauen Grotte,romantik,1828.0,"Zuerst will ich Euch den Titel sagen, den mein...",0,"[0, 20597, 320, 32, 1591, 13, 1227, 612, 5, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
1211,0,"Waiblinger,-Wilhelm",Das Märchen von der blauen Grotte,romantik,1828.0,"Zuerst will ich Euch den Titel sagen, den mein...",1,"[0, 20597, 320, 32, 1591, 13, 1227, 612, 5, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
1212,1,"Waiblinger,-Wilhelm",Das Märchen von der blauen Grotte,romantik,1828.0,Da öffnete sich die Felswand über der Jungfrau...,0,"[0, 1255, 14626, 22, 7, 9848, 2284, 51, 8, 197...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
