In [3]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf
import pandas as pd

In [20]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to eos token
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
# Optional: Freeze certain layers
for layer in model.transformer.h[:8]:  # Freeze the first 8 hidden layers
    layer.trainable = False

In [7]:
# Load your custom dataset
data = pd.read_csv("Conversation.csv")
conversations = [(row['question'], row['answer']) for _, row in data.iterrows()]

In [24]:
input_texts = [q + " " + a for q, a in conversations]
input_encodings = tokenizer(input_texts, padding="max_length", max_length=40, return_tensors="tf", truncation=True)

In [25]:

input_ids = input_encodings["input_ids"]
attention_masks = input_encodings["attention_mask"]

In [26]:
# Shift input ids for labels
labels = tf.roll(input_ids, shift=-1, axis=1)
labels = tf.where(labels == tokenizer.pad_token_id, -100, labels)

In [32]:
# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices(({
    "input_ids": input_ids,
    "attention_mask": attention_masks
}, labels))
dataset = dataset.shuffle(len(input_ids)).batch(8)

In [30]:
# Define custom loss function
def custom_loss(y_true, y_pred):
    mask = tf.not_equal(y_true, -100)  # Create a mask where -100 labels are ignored
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    mask = tf.cast(mask, loss.dtype)  # Ensure mask is the same dtype as loss
    loss = loss * mask  # Apply mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)  # Compute average over non-ignored tokens

In [31]:
# Compile the model with custom loss
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
model.compile(optimizer=optimizer, loss=loss)

In [33]:
# Train the model
model.fit(dataset, epochs=3)  # Adjust epochs based on dataset size and performance

Epoch 1/3


InvalidArgumentError: Graph execution error:

Detected at node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 641, in run_forever

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 1987, in _run_once

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\ENVY 13TH GEN\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\ENVY 13TH GEN\AppData\Local\Temp\ipykernel_18772\3031969751.py", line 2, in <module>

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1229, in fit

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1804, in fit

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1381, in step_function

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1705, in train_step

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 269, in __call__

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 269, in __call__

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 277, in __call__

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\losses.py", line 143, in __call__

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\losses.py", line 270, in call

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\losses.py", line 2454, in sparse_categorical_crossentropy

  File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\backend.py", line 5777, in sparse_categorical_crossentropy

Received a label value of -100 which is outside the valid range of [0, 50257).  Label values: 373 8179 1804 1223 13 1312 16521 13 645 1917 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 72 11 3387 1560 502 13 2130 25436 503 2130 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 3919 338 407 616 8046 13 345 1422 470 1560 502 284 7765 345 510 13 880 11 1312 4001 284 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 5562 2911 1312 1282 736 355 257 3797 13 11875 423 4950 2951 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 72 4152 466 345 467 284 30 1312 467 284 38836 38047 1748 4152 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 10919 11 262 6729 318 1257 13 938 614 356 925 257 1263 6729 805 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 8505 345 4193 787 340 11841 13 644 466 345 765 502 284 466 30 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 13893 1549 1365 2822 534 7846 783 13 345 1276 307 26471 13 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 5832
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_80412]

In [8]:
# Prepare input-output pairs
input_ids = []
attention_masks = []
labels = []

In [9]:
# Set pad_token to eos_token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [10]:
for question, answer in conversations:
    input_text = question + tokenizer.eos_token  # Append EOS to each input
    label_text = answer + tokenizer.eos_token

    # Tokenize input and output
    input_encodings = tokenizer(input_text, return_tensors="tf", padding="max_length", max_length=40, truncation=True)
    label_encodings = tokenizer(label_text, return_tensors="tf", padding="max_length", max_length=40, truncation=True)

    input_ids.append(input_encodings.input_ids)
    attention_masks.append(input_encodings.attention_mask)
    labels.append(label_encodings.input_ids)

In [11]:
input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)
labels = tf.concat(labels, axis=0)

In [12]:
# Define the training dataset
dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks}, labels))
dataset = dataset.shuffle(len(input_ids)).batch(8)  # Adjust batch size as needed

In [13]:
# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)  # Use built-in compute_loss

In [14]:
# Verify dataset batches
print("Checking dataset batches:")
for batch in dataset.take(1):
    print("Sample batch shapes:", batch[0]['input_ids'].shape, batch[1].shape)
    print("Batch data types:", batch[0]['input_ids'].dtype, batch[0]['attention_mask'].dtype, batch[1].dtype)

Checking dataset batches:
Sample batch shapes: (8, 40) (8, 40)
Batch data types: <dtype: 'int32'> <dtype: 'int32'> <dtype: 'int32'>


In [15]:
# Retry dataset creation to ensure proper types
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': input_ids,
        'attention_mask': attention_masks
    },
    labels
)).map(lambda x, y: (
    {
        'input_ids': tf.cast(x['input_ids'], tf.int32),
        'attention_mask': tf.cast(x['attention_mask'], tf.int32)
    },
    tf.cast(y, tf.int32)
))
dataset = dataset.shuffle(len(input_ids)).batch(8)

In [16]:
# Double-check dataset consistency after shuffling and batching
for batch in dataset.take(1):
    print("Verified batch shapes:", batch[0]['input_ids'].shape, batch[1].shape)

Verified batch shapes: (8, 40) (8, 40)


In [17]:
# Ensure model layers are trainable if desired
print("Checking layer trainability:")
for i, layer in enumerate(model.layers):
    print(f"Layer {i} ({layer.name}): trainable={layer.trainable}")

Checking layer trainability:
Layer 0 (transformer): trainable=True


In [18]:
# Attempt training again
try:
    model.fit(dataset, epochs=3)  # Adjust epochs based on dataset size and performance
except AttributeError as e:
    print("Error during training:", e)

Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported

Error during training: in user code:

    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step  *
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    Fi

In [19]:
# Train the model
model.fit(dataset, epochs=3)  # Adjust epochs based on dataset size and performance

Epoch 1/3


AttributeError: in user code:

    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step  *
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\losses.py", line 143, in __call__  *
        losses = call_fn(y_true, y_pred)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\losses.py", line 270, in call  *
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\training.py", line 1207, in compute_loss  *
        y, y_pred, sample_weight, regularization_losses=self.losses
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\ENVY 13TH GEN\AppData\Local\Programs\Python\Python312\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 854, in match_dtype_and_rank  *
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'
