In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, DataCollatorForLanguageModeling, TrainingArguments, Trainer
import torch
from datasets import load_dataset

dataset = load_dataset('text', data_files='./kant.txt')
dataset = dataset.filter(lambda sentence: len(sentence['text']) > 1)
print(dataset['train'][0])


Using custom data configuration default-abdf1a8c1a4047a2
Reusing dataset text (C:\Users\chris\.cache\huggingface\datasets\text\default-abdf1a8c1a4047a2\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\text\default-abdf1a8c1a4047a2\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-c0216063808f3258.arrow


{'text': 'The Project Gutenberg EBook of The Critique of Pure Reason, by Immanuel Kant'}


In [2]:
config = GPT2Config(
    vocab_size=50261,
    n_positions=256,
    n_embd=768,
    activation_function="gelu",
)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'mask_token': '<MASK>'}
tokenizer.add_special_tokens(special_tokens_dict)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config, ignore_mismatched_sizes=True)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized because the shapes did not match:
- wte.weight: found shape torch.Size([50257, 768]) in the checkpoint and torch.Size([50261, 768]) in the model instantiated
- wpe.weight: found shape torch.Size([1024, 768]) in the checkpoint and torch.Size([256, 768]) in the model instantiated
- h.0.attn.bias: found shape torch.Size([1, 1, 1024, 1024]) in the checkpoint and torch.Size([1, 1, 256, 256]) in the model instantiated
- h.1.attn.bias: found shape torch.Size([1, 1, 1024, 1024]) in the checkpoint and torch.Size([1, 1, 256, 256]) in the model instantiated
- h.2.attn.bias: found shape torch.Size([1, 1, 1024, 1024]) in the checkpoint and torch.Size([1, 1, 256, 256]) in the model instantiated
- h.3.attn.bias: found shape torch.Size([1, 1, 1024, 1024]) in the checkpoint and torch.Size([1, 1, 256, 256]) in the model instantiated
- h.4.attn.bias: found shape torch.Size([1, 1, 1024, 1024]) 

In [3]:
def tokenize(batch):
    return tokenizer(str(batch), padding="max_length", truncation=True, max_length=256)

enc_ds = dataset.map(tokenize, batched=False)
print(enc_ds['train'][0])

Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\text\default-abdf1a8c1a4047a2\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-865895cce3062243.arrow


{'text': 'The Project Gutenberg EBook of The Critique of Pure Reason, by Immanuel Kant', 'input_ids': [90, 6, 5239, 10354, 705, 464, 4935, 20336, 412, 10482, 286, 383, 10056, 2350, 286, 17129, 23219, 11, 416, 1846, 18713, 29576, 6, 92, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 5

In [4]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, mlm_probability=0.15
)

In [5]:
train_args = TrainingArguments(
    output_dir="./GPTKant/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=5000,
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=enc_ds['train'],
)

In [None]:
trainer.train()
trainer.save_model()

In [10]:
model = GPT2LMHeadModel.from_pretrained("./GPTKant/")

loading configuration file ./GPTKant/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "use_cache": true,
  "vocab_size": 50261
}

loading weights file ./GPTKant/pytorch_model.bin
All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHead

In [51]:
input = "Human thinking involves human"
tokenized_inputs = tokenizer(input, return_tensors="pt")
out = model.generate(input_ids=tokenized_inputs['input_ids'], attention_mask=tokenized_inputs['attention_mask'], max_length=256, num_beams=5, temperature=0.7, top_k=50, top_p=0.90, no_repeat_ngram_size=2) 
print(tokenizer.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Human thinking involves human 'in the same time, that is, in the conception of the'},'}, and',' use of which we can be given in which, but of a thing, as an object, which it is the understanding,”; and that it may be found to the other hand, we must be in this case, namely, the possibility of all that reason is merely a priori, or that we cannot be regarded as we are'text, if we have a practical reason, it has not merely as the condition of pure reason that of reason.'}' nothing but the world, however, so far as a cognition of this reason—in one is not not only in time to this form of our understanding. But the one'' use'ib' connected with the existence of experience, while the latter', when it, is to be practical practical laws of its its own, I shall one one of an idea of what what we reason. For the internal' of' an' assign' just' its object is a word word, since the relation to objects of objects, without' contains' reason,' just as to its' absolutely that that the former, on the s