In [32]:
# !pip install transformers
# !pip install datasets
# !pip install nltk



# Import required libraries

In [1]:
import numpy as np
from transformers import AutoModelWithLMHead, AutoConfig, Trainer, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments
import os
from datasets import load_dataset, Dataset

# Load dataset

In [2]:
file_name = "data\dr_seuss.txt"
#dataset = load_dataset('text', data_files=file_name)

In [3]:
def split_lyrics_file(text_file):
    text = open(text_file, encoding='utf-8').read()
    text = text.split("\n")
    while "" in text:
        text.remove("")
    return text

In [4]:
lines = split_lyrics_file(file_name)
dict_lines = {"lines": lines}
dataset = Dataset.from_dict(dict_lines)

In [5]:
dataset[1]

{'lines': 'By Dr. Seuss'}

# Create Tokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [24]:
encoded = tokenizer("Hello, I am a single sentence")
print(encoded)

{'input_ids': [15496, 11, 314, 716, 257, 2060, 6827], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [54]:
def tokenize_function(examples):
    return tokenizer(examples["lines"], padding="max_length", truncation=True)

# Create Datasets

In [55]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [71]:
temp = tokenized_datasets.shuffle()

In [70]:
print(tokenized_datasets["lines"][2])
print(tokenized_datasets["input_ids"][2])

The sun did not shine.
[464, 4252, 750, 407, 18340, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [72]:
print(temp["lines"][2])
print(temp["input_ids"][2])

Fox in socks on box on Knox.
[19399, 287, 24359, 319, 3091, 319, 27633, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [73]:
small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(100))
full_train_dataset = tokenized_datasets

# Define Model

In [74]:
model = AutoModelWithLMHead.from_pretrained("gpt2")

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\ieliz/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_le

In [76]:
# Fine-tune with author text data
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_train_dataset
)
trainer.train()

#TO DO: Change eval_dataset
#TO DO: Change training data to reconstruction(?)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 402653184 bytes. Buy new RAM!


# Paraphrasing approach is via reconstruction
* Create dummy dataset; create 'corrupted' Source sentence input/Reference sentence by removing stop word and add noise.
* Then try to reconstruct Reference sentence from source sentence

# Create Source sentences from Reference
* Remove stopwords
* Remove 20% of words
* Shuffle 20% of words

# Fine-tune model
* Use pre-trained GPT-2 checkpoint from HuggingFace Library
* Fine-tune model on reconstruction task
* Concatenate Source and Reference sequence separated by special symbol to form input sequence

In [None]:
def modelTrainer(text_path, epochs, model='gpt2', batch_size=9, cache_dir='cache'):
    model = AutoModelWithLMHead.from_pretrained(model)
    tokenizer = AutoTokenizer.from_pretra

#https://datachef.co/blog/paraphrasing-with-gpt2/
#https://arxiv.org/pdf/2006.05477.pdf
#https://github.com/BH-So/unsupervised-paraphrase-generation
#https://huggingface.co/transformers/training.html