### Imports

In [1]:
import os
from itertools import islice
from typing import Iterator, List

import torch
import torch.nn.functional as F
from speakleash import Speakleash
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Subset
from transformers import AutoTokenizer


In [2]:
# Dataset name handed to `Speakleash.get`.
TRAINING_DATASET = "wolne_lektury_corpus"
# Directory handed to the `Speakleash` constructor.
RAW_DATASET_DIR = "./raw_data"
# Directory that receives the prepared train/eval text files.
PREPARED_DATASET_DIR = "./prepared_data"

# Create both working directories up front; exist_ok=True keeps the cell
# re-runnable when they already exist.
for data_dir in (RAW_DATASET_DIR, PREPARED_DATASET_DIR):
    os.makedirs(data_dir, exist_ok=True)

In [3]:
# Open the SpeakLeash catalogue rooted at RAW_DATASET_DIR and look up the
# training dataset by name.
sl = Speakleash(RAW_DATASET_DIR)
training_speakleash_data = sl.get(TRAINING_DATASET)

# Materialize every document of the dataset into a list.
docs = [*training_speakleash_data.data]

# The count comes from the dataset's own `documents` attribute,
# not from len(docs).
print(f"Documents count: {training_speakleash_data.documents}")

Documents count: 6619


In [4]:
def filter_document(doc: str, min_line_length: int = 20) -> str:
    """Drop short lines from a document.

    The document is split on newline characters and only lines whose length
    is strictly greater than ``min_line_length`` are kept. Length is measured
    on the raw line, so leading and trailing whitespace counts.

    Args:
        doc: Full text of a single document.
        min_line_length: Lines with this many characters or fewer are
            discarded. Defaults to 20.

    Returns:
        The kept lines in their original order, each terminated by a newline.
        An empty string when no line is long enough.
    """
    # Build the result with a single join instead of repeated string
    # concatenation inside the loop.
    return "".join(
        text_line + "\n"
        for text_line in doc.split("\n")
        if len(text_line) > min_line_length
    )


def save_text_data(path: str, docs: List[str]):
    """Write documents to a UTF-8 text file, separated by newlines.

    Args:
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
        docs: Documents to write, joined with a single newline between
            consecutive entries.
    """
    # Join before opening so a failure while joining does not leave a
    # truncated file behind.
    payload = "\n".join(docs)

    with open(path, mode="w", encoding="utf-8") as handle:
        handle.write(payload)

    print(f"Saved data at: {path}")


def load_text_data(path: str) -> str:
    """Read a UTF-8 text file and return its entire content.

    Args:
        path: File to read.

    Returns:
        The full file content as a single string.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [6]:
# Number of leading documents taken from the corpus.
MAX_DOCS = 1200
# Fraction of the selected documents assigned to the training split;
# the remainder becomes the evaluation split.
TRAIN_FRACTION = 0.90

# islice stops after MAX_DOCS items instead of building the full list and
# slicing it afterwards; the selected documents are the same either way.
docs = list(islice(training_speakleash_data.data, MAX_DOCS))
filtered_docs = [filter_document(doc) for doc in docs]

n = len(filtered_docs)

# Ordered (non-shuffled) split: the first TRAIN_FRACTION of the documents
# go to train, the rest to eval.
split_idx = int(TRAIN_FRACTION * n)
train_docs = filtered_docs[:split_idx]
eval_docs = filtered_docs[split_idx:]

# The two splits must partition the filtered documents exactly.
assert len(train_docs) + len(eval_docs) == n

print(f"train_docs.len = {len(train_docs)}")
# Fixed label: this line reports the evaluation split, not the training one.
print(f"eval_docs.len = {len(eval_docs)}")

save_text_data(path=os.path.join(PREPARED_DATASET_DIR, "train.txt"), docs=train_docs)
save_text_data(path=os.path.join(PREPARED_DATASET_DIR, "eval.txt"), docs=eval_docs)

train_docs.len = 1080
train_docs.len = 120
Saved data at: ./prepared_data/train.txt
Saved data at: ./prepared_data/eval.txt


In [None]:
# Read the prepared splits back from disk as single strings.
train_text_path = os.path.join(PREPARED_DATASET_DIR, "train.txt")
train_text = load_text_data(train_text_path)

eval_text_path = os.path.join(PREPARED_DATASET_DIR, "eval.txt")
eval_text = load_text_data(eval_text_path)

In [None]:
# Identifier passed to AutoTokenizer.from_pretrained below
# (presumably a Hugging Face Hub repo id — TODO confirm).
model_name = "radlab/polish-gpt2-small-v2"
# Load the tokenizer associated with `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)