In [1]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM, AutoModelForCausalLM, \
                         TFAutoModelForQuestionAnswering, TrainingArguments, Trainer, \
                         DataCollatorForLanguageModeling
import tensorflow as tf
import pandas as pd
import numpy as np
import datasets
import torch

In [2]:
# Avoid out of memory errors
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Get Pretrained Model

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
input_text = '''Jarvis's Persona: An AI assistant that is calm, sophisticated, and dependable with a touch of dry wit
<START>
You: Hey Jarvis you ready?
[CHARACTER]:'''

tokens = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(tokens, max_length=75, temperature=0.8, do_sample=True)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Jarvis's Persona: An AI assistant that is calm, sophisticated, and dependable with a touch of dry wit
<START>
You: Hey Jarvis you ready?
[CHARACTER]: Thanks for taking this opportunity to test the Persona experience. I've started thinking about doing certain things that I don't want to do, or that need to change.


# Collect Data

In [6]:
query = []
response = []

with open("D:/Users/Natha/Datasets/MyJarvisConversation/conversation.txt", "r") as f:
    for line in f.readlines():
        if line[0] == "U":
            query.append(line[6:].split("\n")[0])
        elif line[0] == "J":
            response.append(line[8:].split("\n")[0])
        else:
            pass

In [7]:
tokenizer.add_tokens(query + response)

395

In [8]:
data = {"query": query,
        "response": response}

In [9]:
dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=data))

In [10]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer([" ".join(x) for x in examples["response"]])

In [11]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [12]:
tokenized_ds = dataset.map(tokenizer_wrapper.tokenize_function,
                           batched=True,
                           num_proc=4,
                           remove_columns=dataset.column_names)

Map (num_proc=4):   0%|          | 0/241 [00:00<?, ? examples/s]

In [13]:
block_size = 128

In [14]:
class PreprocessWrapper():
    def __init__(self, block_size):
        self.block_size = block_size
        
    def group_texts(self, examples):
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= self.block_size:
            total_length = (total_length // self.block_size) * self.block_size
        # Split by chunks of block_size.
        result = {
            k: [t[i : i + self.block_size] for i in range(0, total_length, self.block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

In [15]:
preprocess_wrapper = PreprocessWrapper(block_size)

In [36]:
tokenized_ds[0]["input_ids"]

[32,
 50414,
 82,
 50414,
 50414,
 50414,
 64,
 50414,
 75,
 50414,
 86,
 50414,
 64,
 50414,
 88,
 50414,
 82,
 50414,
 11,
 50414,
 50414,
 50414,
 64,
 50414,
 50414,
 50414,
 79,
 50414,
 75,
 50414,
 68,
 50414,
 64,
 50414,
 82,
 50414,
 84,
 50414,
 81,
 50414,
 68,
 50414,
 50414,
 50414,
 83,
 50414,
 78,
 50414,
 50414,
 50414,
 82,
 50414,
 68,
 50414,
 68,
 50414,
 50414,
 50414,
 88,
 50414,
 78,
 50414,
 84,
 50414,
 50414,
 50414,
 82,
 50414,
 72,
 50414,
 81]

In [16]:
lm_dataset = tokenized_ds.map(preprocess_wrapper.group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/241 [00:00<?, ? examples/s]

In [17]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")

In [18]:
tf_train_set = model.prepare_tf_dataset(
    lm_dataset,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Train Model

In [24]:
from transformers import create_optimizer, AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [25]:
model.compile(optimizer=optimizer)

In [37]:
model.fit(tf_train_set, epochs=5)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'tfgpt2lm_head_model/transformer/assert_less/Assert/Assert' defined at (most recent call last):
    File "C:\Users\Natha\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\Natha\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\Natha\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\Natha\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\Natha\anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Users\Natha\anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Users\Natha\anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\Natha\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\Natha\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Natha\AppData\Local\Temp\ipykernel_1096\1326729149.py", line 1, in <cell line: 1>
      model.fit(tf_train_set, epochs=5)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 1658, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 805, in run_call_with_unpacked_inputs
      try:
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 837, in call
      transformer_outputs = self.transformer(
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\Natha\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 805, in run_call_with_unpacked_inputs
      try:
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 437, in call
      if inputs_embeds is None:
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 438, in call
      check_embeddings_within_bounds(input_ids, self.config.vocab_size)
    File "C:\Users\Natha\anaconda3\lib\site-packages\transformers\tf_utils.py", line 161, in check_embeddings_within_bounds
      tf.debugging.assert_less(
Node: 'tfgpt2lm_head_model/transformer/assert_less/Assert/Assert'
assertion failed: [The maximum value of input_ids (Tensor(\"tfgpt2lm_head_model/transformer/Max:0\", shape=(), dtype=int32)) must be smaller than the embedding layer\'s input dimension (50257). The likely cause is some problem at tokenization time.] [Condition x < y did not hold element-wise:] [x (tfgpt2lm_head_model/transformer/Reshape:0) = ] [[64 50414 84...]...] [y (tfgpt2lm_head_model/transformer/Cast_1/x:0) = ] [50257]
	 [[{{node tfgpt2lm_head_model/transformer/assert_less/Assert/Assert}}]] [Op:__inference_train_function_58687]