In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from tqdm.auto import tqdm
from nltk import sent_tokenize
from more_itertools import windowed
from random import choice, seed
tqdm.pandas()

In [3]:
corpus = pd.read_csv("../data/rom_real_dataset_final.csv")
corpus = corpus[["author", "title", "epoch", "text"]]

In [4]:
corpus_train = pd.concat([
    corpus.query("epoch == 'romantik'").sample(15, random_state=42),
    corpus.query("epoch == 'realismus'").sample(15, random_state=42)
])
corpus_test = corpus.drop(corpus_train.index)

with open("../data/nsp_corpus_train_idx.txt", "w") as f:
    f.write("\n".join(map(str, corpus_train.index)))

with open("../data/nsp_corpus_test_idx.txt", "w") as f:
    f.write("\n".join(map(str, corpus_test.index)))

In [5]:
def make_nsp_dataset(df, random_state=42):
    seed(random_state)
    data = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Creating NSP dataset"):
        text = row.pop("text")
        sents = sent_tokenize(text, language="german")
        for pair in windowed(sents, 2):
            sent_1, sent_2 = pair 
            
            true_pair = " ".join(pair)
            true_entry = row.copy()
            true_entry["pair"] = true_pair
            true_entry["sent1"] = sent_1
            true_entry["sent2"] = sent_2
            true_entry["label"] = 0
            data.append(true_entry)
            
            # chose any sent from same text that is not sent_2
            while (false_sent := choice(sents)) == sent_2:
                pass
            
            false_pair = " ".join((sent_1, false_sent))
            false_entry = row.copy()
            false_entry["pair"] = false_pair
            false_entry["sent1"] = sent_1
            false_entry["sent2"] = false_sent
            false_entry["label"] = 1
            data.append(false_entry)

    return pd.DataFrame.from_records(data)    

In [6]:
#train_dataset = make_nsp_dataset(corpus_train)
#test_dataset = make_nsp_dataset(corpus_test)

In [7]:
#train_dataset.to_csv("../data/nsp_traindataset.csv", index=False)
#test_dataset.to_csv("../data/nsp_testdataset.csv", index=False)

In [8]:
train_dataset = pd.read_csv("../data/nsp_traindataset.csv")
test_dataset = pd.read_csv("../data/nsp_testdataset.csv")

In [9]:
train_dataset.label.value_counts()

0    47136
1    47134
Name: label, dtype: int64

In [10]:
test_dataset.label.value_counts()

0    381143
1    381143
Name: label, dtype: int64

In [11]:
# Create SBert Model

from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('bert-base-german-cased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=384, activation_function=nn.Tanh())


model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model],
    device="cuda:0"
)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Set params

NUM_EPOCHS = 5

LR = 3e-5

TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 16

WARMUP_STEPS = 250

EVAL_STEPS = 2000

OUTPUT_PATH = "nsp_classif"

CPKT_PATH = "nsp_classif/sbert"
SAVE_STEPS = 5000
TOTAL_SAVE_LIMIT = 5

In [13]:
# Make dataset

from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.02, random_state=42)

In [14]:
from sentence_transformers import losses, InputExample, SentencesDataset
from torch.utils.data import DataLoader

def make_sbert_dataset(dataset):
    examples = [
        InputExample(texts=[row["sent1"], row["sent2"]], label=row["label"])
        for _, row in dataset.iterrows()
    ]
    sbert_dataset = SentencesDataset(examples, model)
    return sbert_dataset

In [15]:
train_dataloader = DataLoader(make_sbert_dataset(train_dataset), shuffle=True, batch_size=TRAIN_BATCH_SIZE)
val_dataloader = DataLoader(make_sbert_dataset(val_dataset), shuffle=True, batch_size=VAL_BATCH_SIZE)


In [16]:
# Initiate loss

train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
    concatenation_sent_rep=True
)


In [17]:
# Start training

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    #evaluator=evaluator,
    epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    optimizer_params={"lr": LR},
    #evaluation_steps=EVAL_STEPS,
    output_path=OUTPUT_PATH,
    save_best_model=True,
    checkpoint_path=CPKT_PATH,
    checkpoint_save_steps=SAVE_STEPS,
    checkpoint_save_total_limit=TOTAL_SAVE_LIMIT
)



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11548 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11548 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11548 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11548 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11548 [00:00<?, ?it/s]

In [18]:
import torch
# the "loss" is actually a nn.module containing the weights of the classifier (and the actual model as submodule...)
torch.save(train_loss, "nsp_classif/train_loss_state.torch")

In [19]:
import numpy as np
# TODO morgen
def predict(examples, softmax_loss):
    true, pred = [], []
    embeddings = []
    for example in tqdm(examples, desc="Predicting..."):
        texts = example.texts
        features = [
            softmax_loss.model.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length").to(softmax_loss.model.device)
            for text in texts
        ]
        embs, output = softmax_loss(features, labels=None)
        true.append(example.label)
        pred.append(output.detach().cpu().numpy().reshape(-1).argmax())
        embeddings.append(np.array([e.detach().cpu().numpy() for e in embs]))
    return np.array(true), np.array(pred), embeddings

In [20]:
true, pred, embeddings = predict(
    make_sbert_dataset(val_dataset),
    train_loss)

Predicting...:   0%|          | 0/1886 [00:00<?, ?it/s]

In [21]:
from sklearn.metrics import classification_report

print(classification_report(true, pred))

              precision    recall  f1-score   support

           0       0.69      0.67      0.68       941
           1       0.68      0.69      0.69       945

    accuracy                           0.68      1886
   macro avg       0.68      0.68      0.68      1886
weighted avg       0.68      0.68      0.68      1886

