In [2]:
import pandas as pd
import tensorflow as tf

In [3]:
script = pd.read_csv('C:\\Users\\MIHIR\\work\\datasets\\friends\\Friends_script.csv')
script.head()

Unnamed: 0,Name,Lines
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."


In [4]:
print(script.isnull().sum())
script= script.dropna()

Name      1
Lines    32
dtype: int64


In [5]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")
def tokenize(script):
    print(tokenizer.encode(script['Lines']).ids)
tokenize(script.loc[0])

  from .autonotebook import tqdm as notebook_tqdm


[101, 1247, 112, 188, 1720, 1106, 1587, 106, 1124, 112, 188, 1198, 1199, 2564, 146, 1250, 1114, 106, 102]


In [6]:
mapping_token_to_index = {}
mapping_index_to_token = {}

def generateVocab(script):
    vocab = set()

    for line in script['Lines']:
        for word in line.split():
            clean_word = ''.join([char for char in word if char.isalpha()])
            if clean_word:
                vocab.add(clean_word.lower())

    vocab.update(["<bos>", "<eos>", "<unk>", "<pad>"])

    vocab = sorted(vocab)

    # Build mappings
    for index, token in enumerate(vocab):
        mapping_token_to_index[token] = index
        mapping_index_to_token[index] = token

    return vocab

def token_to_index(script):
    tokenized_script = [mapping_token_to_index["<bos>"]]

    clean_word = ""
    for char in script:
        if char == " ":
            if clean_word:
                tokenized_script.append(
                    mapping_token_to_index.get(clean_word.lower(), mapping_token_to_index["<unk>"])
                )
                clean_word = ""
        elif char in [".", "!", "?"]:
            if clean_word:
                tokenized_script.append(
                    mapping_token_to_index.get(clean_word.lower(), mapping_token_to_index["<unk>"])
                )
                clean_word = ""
            tokenized_script.append(mapping_token_to_index["<eos>"])
        elif char.isalpha():
            clean_word += char.lower()

    if clean_word:
        tokenized_script.append(
            mapping_token_to_index.get(clean_word.lower(), mapping_token_to_index["<unk>"])
        )

    if tokenized_script[-1] != mapping_token_to_index["<eos>"]:
        tokenized_script.append(mapping_token_to_index["<eos>"])

    return tokenized_script

def index_to_token(indexes):
    return [mapping_index_to_token.get(index, "<unk>") for index in indexes]



In [7]:
vocab = generateVocab(script)




print("Vocabulary size:", len(vocab))
print("First 10 words in vocabulary:", vocab[:10])

text = "Hi I am Ross."
encoded = token_to_index(text)
decoded = index_to_token(encoded)

print(f"Tokens for '{text}':", encoded)
print("Decoded tokens:", decoded)

Vocabulary size: 16192
First 10 words in vocabulary: ['<bos>', '<eos>', '<pad>', '<unk>', 'a', 'aa', 'aaaaaaaaaaaaaaaaaaaaaaaaaaadoption', 'aaaaaahhhh', 'aaaaah', 'aaaaahhh']
Tokens for 'Hi I am Ross.': [0, 6221, 6565, 378, 11708, 1]
Decoded tokens: ['<bos>', 'hi', 'i', 'am', 'ross', '<eos>']


In [8]:
text = "Hi I am Ross."
encoded = tokenizer.encode(text).ids
decoded = tokenizer.decode(encoded)

print(f"Tokens for '{text}':", encoded)
print("Decoded tokens:", decoded)
type(decoded)

Tokens for 'Hi I am Ross.': [101, 8790, 146, 1821, 5104, 119, 102]
Decoded tokens: Hi I am Ross.


str

In [9]:
class CombinedEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, num_characters, max_seq_len, embedding_dim):
        super().__init__()
        self.token_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.speaker_embedding = tf.keras.layers.Embedding(num_characters, embedding_dim)
        self.position_embedding = tf.keras.layers.Embedding(max_seq_len, embedding_dim)

    def call(self, token_ids, speaker_ids):
        seq_len = tf.shape(token_ids)[1]
        batch_size = tf.shape(token_ids)[0]

        # Token embeddings: (batch_size, seq_len, embedding_dim)
        token_embed = self.token_embedding(token_ids)

        # Speaker embeddings: (batch_size, embedding_dim) → (batch_size, 1, embedding_dim) → broadcast
        speaker_embed = self.speaker_embedding(speaker_ids)
        speaker_embed = tf.expand_dims(speaker_embed, axis=1)
        speaker_embed = tf.tile(speaker_embed, [1, seq_len, 1])  # (batch_size, seq_len, embedding_dim)

        # Position embeddings: (1, seq_len)
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embed = self.position_embedding(positions)
        position_embed = tf.expand_dims(position_embed, axis=0)
        position_embed = tf.tile(position_embed, [batch_size, 1, 1])



        return token_embed + speaker_embed + position_embed


In [10]:
unique_speakers = sorted(set(script['Name']))

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

speaker_to_index = {name: i for i, name in enumerate(unique_speakers)}
tokenized_lines = []
speaker_indices = []
max_seq = 0

for i in range(len(script)):
    line = script['Lines'].iloc[i]
    name = script['Name'].iloc[i]
    line_encode = token_to_index(line)
    tokenized_lines.append(line_encode)
    if name in speaker_to_index:
        speaker_idx = speaker_to_index[name]
        speaker_indices.append(speaker_idx)
    else:
        speaker_indices.append(0)
    max_seq = max(len(line_encode), max_seq)

token_ids_padded = pad_sequences(tokenized_lines, maxlen=max_seq, padding='post')

token_ids_tensor = tf.constant(token_ids_padded, dtype=tf.int32)
speaker_ids_tensor = tf.constant(speaker_indices, dtype=tf.int32)

In [12]:
max_seq

246

In [13]:
def create_padding_mask(seq):
    # Create a mask for the padding values (assuming 0 is your padding index)
    seq = tf.cast(tf.math.equal(seq, mapping_token_to_index["<pad>"]), tf.float32) # Use "<pad>" index
    # Add extra dimensions to add the padding to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [14]:
tf.shape(token_ids_tensor)
tf.shape(speaker_ids_tensor)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([55267])>

In [15]:
emb = CombinedEmbedding(len(vocab),len(unique_speakers),246,256)
# output = emb(token_ids_tensor, speaker_ids_tensor)

In [16]:
class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self,MHA_num_heads, emb_dim, ff_dim , dropout_rate = 0.1):
        super().__init__()
        self.MHA = tf.keras.layers.MultiHeadAttention(
            num_heads=MHA_num_heads,
            key_dim=emb_dim // MHA_num_heads, # key_dim per head
            output_shape=emb_dim # Ensure MHA output matches emb_dim
        )
        self.ff = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation = 'relu'),
            tf.keras.layers.Dense(emb_dim) # Ensure FF output matches emb_dim
        ])
        self.layernom1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernom2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self , x, train, attention_mask=None):
        # Add shape assertion here
        tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message="Input shape to DecoderBlock is incorrect")

        attention_output = self.MHA(query = x, value = x , key = x, attention_mask = attention_mask, training = train)
        # Optional: Add assertion for shape if debugging, e.g.
        # tf.debugging.assert_equal(tf.shape(attention_output)[-1], tf.shape(x)[-1], message="MHA output dim mismatch")
        attention_output = self.dropout1(attention_output,training = train)
        out1 = self.layernom1(x + attention_output)

        ff_output = self.ff(out1)
        ff_output = self.dropout2(ff_output, training = train)
        out2 = self.layernom2(out1 + ff_output)

        # REMOVE THE DENSE LAYER PROJECTING TO VOCAB SIZE FROM HERE
        # logits = tf.keras.layers.Dense(len(vocab))(out2)
        return out2 # Return the tensor with emb_dim


class DecoderTransformer(tf.keras.Model):
    def __init__(self,vocab_size,num_characters,max_seq,embedding_dim,MHA_num_heads, ff_dim, num_decoder_blocks, dropout_rate=0.1):
        super().__init__()
        self.embedding = CombinedEmbedding(vocab_size, num_characters, max_seq, embedding_dim)
        self.decoder_blocks= [DecoderBlock(MHA_num_heads, embedding_dim, ff_dim, dropout_rate) for _ in range(num_decoder_blocks)]
        self.final_dense = tf.keras.layers.Dense(vocab_size) # MOVED THE DENSE LAYER HERE

    def call(self,token_ids_tensor,speaker_ids_tensor,attention_mask):
        x = self.embedding(token_ids_tensor,speaker_ids_tensor)
        # Add assertion here to check the shape after embedding
        tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message="Shape after embedding is incorrect")


        for i, decoder_block in enumerate(self.decoder_blocks):
            x = decoder_block(x,train=True, attention_mask = attention_mask) # x maintains embedding_dim
            # Add assertion here to check the shape after each decoder block
            tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message=f"Shape after decoder block {i} is incorrect")


        logits = self.final_dense(x) # Apply final projection to vocab_size
        return logits

In [100]:
from tensorflow.keras.optimizers import Adam
batch_size = 16
dataset = tf.data.Dataset.from_tensor_slices((token_ids_tensor, speaker_ids_tensor))
dataset = dataset.shuffle(buffer_size=len(script))
batched_dataset = dataset.batch(batch_size)
optimizer = Adam(learning_rate=0.0001)

model = DecoderTransformer(
    vocab_size=len(vocab),
    num_characters=len(unique_speakers),
    max_seq=max_seq,
    embedding_dim=128,
    MHA_num_heads=8,
    ff_dim=1024,
    num_decoder_blocks=3,
    dropout_rate=0.1
)

In [23]:
from tensorflow import keras

# Load the previously saved model
loaded_model = tf.keras.models.load_model('decoder_transformer_model.keras')

TypeError: Could not locate class 'DecoderTransformer'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'DecoderTransformer', 'config': {'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}, 'registered_name': 'DecoderTransformer', 'build_config': {'input_shape': [16, 245]}}

TypeError: <class '__main__.DecoderTransformer'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': None, 'class_name': 'DecoderTransformer', 'config': {'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}, 'registered_name': 'DecoderTransformer', 'build_config': {'input_shape': [16, 245]}, 'compile_config': None}.

Exception encountered: Unable to revive model from config. When overriding the `get_config()` method, make sure that the returned config contains all items used as arguments in the  constructor to <class '__main__.DecoderTransformer'>, which is the default behavior. You can override this default behavior by defining a `from_config(cls, config)` class method to specify how to create an instance of DecoderTransformer from its config.

Received config={'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}

Error encountered during deserialization: DecoderTransformer.__init__() got an unexpected keyword argument 'trainable'

In [None]:
model = tf.keras

In [131]:
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    total_loss_epoch = []
    for step, (batch_token_ids, batch_speaker_ids) in enumerate(batched_dataset):
        with tf.GradientTape() as tape:
            input_batch_token_ids = batch_token_ids[:,:-1]
            output_batch_token_ids = batch_token_ids[:,1:]
            padding_mask = create_padding_mask(input_batch_token_ids)

            predicted_token = model(
                input_batch_token_ids,
                batch_speaker_ids,
                attention_mask=padding_mask,
                training=True # Ensure this is True
            )
            cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            loss = cross_entropy(output_batch_token_ids, predicted_token)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        total_loss_epoch.append(loss.numpy())
        if step % 100 == 0: # Print loss every 100 steps
             print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.numpy()}")
    avg_loss = sum(total_loss_epoch)/len(total_loss_epoch) if total_loss_epoch else 0
    print(f"Epoch {epoch+1} Average Loss: {avg_loss}")

Epoch 1/10
Epoch 1, Step 0, Loss: 0.18550138175487518
Epoch 1, Step 100, Loss: 0.38539257645606995
Epoch 1, Step 200, Loss: 0.17803947627544403
Epoch 1, Step 300, Loss: 0.23870041966438293
Epoch 1, Step 400, Loss: 0.203701913356781
Epoch 1, Step 500, Loss: 0.23107855021953583
Epoch 1, Step 600, Loss: 0.24999825656414032
Epoch 1, Step 700, Loss: 0.15560653805732727
Epoch 1, Step 800, Loss: 0.1574213206768036
Epoch 1, Step 900, Loss: 0.33797749876976013
Epoch 1, Step 1000, Loss: 0.24488836526870728
Epoch 1, Step 1100, Loss: 0.33744415640830994
Epoch 1, Step 1200, Loss: 0.21580885350704193
Epoch 1, Step 1300, Loss: 0.22068382799625397
Epoch 1, Step 1400, Loss: 0.21141791343688965
Epoch 1, Step 1500, Loss: 0.21258792281150818
Epoch 1, Step 1600, Loss: 0.14725932478904724
Epoch 1, Step 1700, Loss: 0.16490058600902557
Epoch 1, Step 1800, Loss: 0.2049277424812317
Epoch 1, Step 1900, Loss: 0.1888989508152008
Epoch 1, Step 2000, Loss: 0.2072296142578125
Epoch 1, Step 2100, Loss: 0.2594032883644

KeyboardInterrupt: 

In [132]:
model.save('decoder_transformer_model.keras')

In [133]:
import numpy as np
def generate(start_string, character_name, max_generate_length):
    # 1. Preprocess inputs
    if character_name not in speaker_to_index:
        print(f"Speaker '{character_name}' not found. Using default speaker index 0.")
        speaker_idx = 0
    else:
        speaker_idx = speaker_to_index[character_name]

    speaker_id_tensor = tf.constant([speaker_idx], dtype=tf.int32)
    input_tokens = token_to_index(start_string)

    generated_sequence_ids = list(input_tokens)
    if generated_sequence_ids[-1] == mapping_token_to_index["<eos>"]:
        generated_sequence_ids.pop()

    full_output_ids = list(generated_sequence_ids)


    for _ in range(max_generate_length):
        current_input_ids = list(full_output_ids) # Use the growing full_output_ids
        if len(current_input_ids) >= max_seq: # If sequence gets too long for positional embedding
            current_input_ids = current_input_ids[-(max_seq - 1):] # Keep the most recent (max_seq-1) tokens

        current_tokens_tensor = tf.constant([current_input_ids], dtype=tf.int32)

        # Create padding mask for the current_tokens_tensor.
        # This mask will be all zeros if current_tokens_tensor has no <pad> tokens.
        padding_mask = create_padding_mask(current_tokens_tensor)

        # Get model predictions (logits)
        predictions = model(
            current_tokens_tensor,
            speaker_id_tensor,
            attention_mask=padding_mask,
            training=False
        )

        predicted_logits_for_next_token = predictions[:, -1, :]

        # Use argmax for greedy decoding
        predicted_id = tf.argmax(predicted_logits_for_next_token, axis=-1).numpy()[0]

        full_output_ids.append(predicted_id)

        # if predicted_id == mapping_token_to_index["<eos>"]:
            # break # Stop if <eos> is generated

    # Decode the generated token IDs to text
    # full_output_ids contains prompt + generated part
    decoded_tokens_list = index_to_token(full_output_ids)

    # Clean up for display:
    # Remove initial <bos> if the prompt started with it (token_to_index adds it)
    if decoded_tokens_list and decoded_tokens_list[0] == "<bos>":
        decoded_tokens_list = decoded_tokens_list[1:]

    # Join and replace special tokens
    final_sentence = " ".join(decoded_tokens_list)
    final_sentence = final_sentence.replace(" <eos>", ".").replace("<eos>", ".") # Handle EOS at the very end
    final_sentence = final_sentence.replace(" <unk>", " [UNK]").replace("<unk>", "[UNK]")
    final_sentence = final_sentence.replace(" <pad>", "").replace("<pad>", "") # Remove any pad tokens

    return final_sentence.strip()

In [134]:
speaker = "Chandler"
generated_line = generate("Ok",speaker, 60)
print(f"{speaker}: {generated_line}")

Chandler: ok.. i just i was the time i <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos> <bos>
