In [46]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [47]:
script = pd.read_csv('/content/Friends_script.csv').dropna()
print(script.head())
print(script.isnull().sum())

       Name                                              Lines
0    Monica  There's nothing to tell! He's just some guy I ...
1      Joey  C'mon, you're going out with the guy! There's ...
2  Chandler  All right Joey, be nice. So does he have a hum...
3    Phoebe                           Wait, does he eat chalk?
4    Phoebe  Just, 'cause, I don't want her to go through w...
Name     0
Lines    0
dtype: int64


In [48]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")
def tokenize(script):
    print(tokenizer.encode(script['Lines']).ids)
tokenize(script.loc[0])

[101, 1247, 112, 188, 1720, 1106, 1587, 106, 1124, 112, 188, 1198, 1199, 2564, 146, 1250, 1114, 106, 102]


In [49]:
vocab = tokenizer.get_vocab()
def sentence_to_token(sentence):
    return tokenizer.encode(sentence, add_special_tokens=False)
def token_to_sentence(tokens):
    return tokenizer.decode(tokens, skip_special_tokens=True)


print("Vocabulary size:", len(vocab))

text = "Hi I am Ross."
encoded = sentence_to_token(text)
decoded = token_to_sentence(encoded.ids)

print(f"Tokens for '{text}':", encoded.tokens)
print("Decoded tokens:", decoded)

Vocabulary size: 28996
Tokens for 'Hi I am Ross.': ['Hi', 'I', 'am', 'Ross', '.']
Decoded tokens: Hi I am Ross.


In [50]:
@tf.keras.utils.register_keras_serializable()
class CombinedEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, num_characters,  max_seq_len, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.num_characters = num_characters
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim

        self.token_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.speaker_embedding = tf.keras.layers.Embedding(num_characters, embedding_dim)
        self.position_embedding = tf.keras.layers.Embedding(max_seq_len, embedding_dim)

    def call(self, token_ids, speaker_ids):
        seq_len = tf.shape(token_ids)[1]
        batch_size = tf.shape(token_ids)[0]

        token_embed = self.token_embedding(token_ids)

        speaker_embed = self.speaker_embedding(speaker_ids)
        speaker_embed = tf.expand_dims(speaker_embed, axis=1)
        speaker_embed = tf.tile(speaker_embed, [1, seq_len, 1])

        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embed = self.position_embedding(positions)
        position_embed = tf.expand_dims(position_embed, axis=0)
        position_embed = tf.tile(position_embed, [batch_size, 1, 1])

        return token_embed + speaker_embed + position_embed

    def get_config(self):
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'num_characters': self.num_characters,
            'max_seq_len': self.max_seq_len,
            'embedding_dim': self.embedding_dim,
        })
        return config

In [51]:
unique_speakers = sorted(set(script['Name']))

In [52]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

speaker_to_index = {name: i for i, name in enumerate(unique_speakers)}
tokenized_lines = []
speaker_indices = []
max_seq = 0

for i in range(len(script)):
    line = script['Lines'].iloc[i]
    name = script['Name'].iloc[i]
    line_encode = sentence_to_token(line).ids
    tokenized_lines.append(line_encode)
    if name in speaker_to_index:
        speaker_idx = speaker_to_index[name]
        speaker_indices.append(speaker_idx)
    else:
        speaker_indices.append(0)
    max_seq = max(len(line_encode), max_seq)

token_ids_padded = pad_sequences(tokenized_lines, maxlen=max_seq, padding='post')

token_ids_tensor = tf.constant(token_ids_padded, dtype=tf.int32)
speaker_ids_tensor = tf.constant(speaker_indices, dtype=tf.int32)

In [53]:
max_seq

270

In [54]:
token_ids_padded[0]

array([1247,  112,  188, 1720, 1106, 1587,  106, 1124,  112,  188, 1198,
       1199, 2564,  146, 1250, 1114,  106,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [55]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq,0), tf.int32) # Use "<pad>" index
    # Add extra dimensions to add the padding to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [56]:
tf.shape(token_ids_tensor)
tf.shape(speaker_ids_tensor)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([55267], dtype=int32)>

In [57]:
# emb = CombinedEmbedding(len(vocab),len(unique_speakers),246,256)
# output = emb(token_ids_tensor, speaker_ids_tensor)

In [58]:
@tf.keras.utils.register_keras_serializable()
class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, MHA_num_heads, emb_dim, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.MHA_num_heads = MHA_num_heads
        self.emb_dim = emb_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.MHA = tf.keras.layers.MultiHeadAttention(
            num_heads=MHA_num_heads,
            key_dim=emb_dim // MHA_num_heads,
            output_shape=emb_dim
        )
        self.ff = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(emb_dim)
        ])
        self.layernom1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernom2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training, attention_mask=None):
        tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message="Input shape to DecoderBlock is incorrect")

        attention_output = self.MHA(query=x, value=x, key=x, attention_mask=attention_mask, training=training)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.layernom1(x + attention_output)

        ff_output = self.ff(out1)
        ff_output = self.dropout2(ff_output, training=training)
        out2 = self.layernom2(out1 + ff_output)

        return out2

    def get_config(self):
        config = super().get_config()
        config.update({
            'MHA_num_heads': self.MHA_num_heads,
            'emb_dim': self.emb_dim,
            'ff_dim': self.ff_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config


@tf.keras.utils.register_keras_serializable()
class DecoderTransformer(tf.keras.Model):
    def __init__(self, vocab_size, num_characters, max_seq, embedding_dim, MHA_num_heads, ff_dim, num_decoder_blocks, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.num_characters = num_characters
        self.max_seq = max_seq
        self.embedding_dim = embedding_dim
        self.MHA_num_heads = MHA_num_heads
        self.ff_dim = ff_dim
        self.num_decoder_blocks = num_decoder_blocks
        self.dropout_rate = dropout_rate

        self.embedding = CombinedEmbedding(vocab_size, num_characters, max_seq, embedding_dim)
        self.decoder_blocks = [DecoderBlock(MHA_num_heads, embedding_dim, ff_dim, dropout_rate) for _ in range(num_decoder_blocks)]
        self.final_dense = tf.keras.layers.Dense(vocab_size)

    def call(self, token_ids_tensor, speaker_ids_tensor, attention_mask=None, training=None):
        x = self.embedding(token_ids_tensor, speaker_ids_tensor)
        tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message="Shape after embedding is incorrect")

        for i, decoder_block in enumerate(self.decoder_blocks):
            x = decoder_block(x, training=training, attention_mask=attention_mask)
            tf.debugging.assert_shapes([(x, ['batch', 'seq', 'emb_dim'])], message=f"Shape after decoder block {i} is incorrect")

        logits = self.final_dense(x)
        return logits

    def get_config(self):
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'num_characters': self.num_characters,
            'max_seq': self.max_seq,
            'embedding_dim': self.embedding_dim,
            'MHA_num_heads': self.MHA_num_heads,
            'ff_dim': self.ff_dim,
            'num_decoder_blocks': self.num_decoder_blocks,
            'dropout_rate': self.dropout_rate,
        })
        return config

In [59]:
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np

batch_size = 16

train_token_ids, val_token_ids, train_speaker_ids, val_speaker_ids = train_test_split(
    token_ids_tensor.numpy(), speaker_ids_tensor.numpy(), test_size=0.1, random_state=42
)

train_dataset = tf.data.Dataset.from_tensor_slices((train_token_ids, train_speaker_ids)).shuffle(10000).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((val_token_ids, val_speaker_ids)).batch(batch_size)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.95
)
optimizer = Adam(learning_rate=lr_schedule)

model = DecoderTransformer(
    vocab_size=len(vocab),
    num_characters=len(unique_speakers),
    max_seq=max_seq,
    embedding_dim=256,
    MHA_num_heads=8,
    ff_dim=1024,
    num_decoder_blocks=3,
    dropout_rate=0.1
)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [60]:
num_epochs = 5
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    total_train_loss = []

    for step, (batch_token_ids, batch_speaker_ids) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            input_ids = batch_token_ids[:, :-1]
            target_ids = batch_token_ids[:, 1:]
            mask = create_padding_mask(input_ids)

            logits = model(input_ids, batch_speaker_ids, attention_mask=mask, training=True)
            loss = loss_fn(target_ids, logits)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        total_train_loss.append(loss.numpy())

        if step % 100 == 0:
            print(f"Step {step}, Loss: {loss.numpy():.4f}")


    total_val_loss = []
    for val_tokens, val_speakers in val_dataset:
        input_ids = val_tokens[:, :-1]
        target_ids = val_tokens[:, 1:]
        mask = create_padding_mask(input_ids)

        val_logits = model(input_ids, val_speakers, attention_mask=mask, training=False)
        val_loss = loss_fn(target_ids, val_logits)
        total_val_loss.append(val_loss.numpy())

    avg_train_loss = np.mean(total_train_loss)
    avg_val_loss = np.mean(total_val_loss)

    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        model.save_weights("best_model_weights.h5")
        print("Saved best model weights.")
    else:
        patience_counter += 1
        if patience_counter >= 2:
            print("Early stopping triggered.")
            break


Epoch 1/5
Step 0, Loss: 10.2969
Step 100, Loss: 0.2104
Step 200, Loss: 0.4081
Step 300, Loss: 0.1899
Step 400, Loss: 0.2715
Step 500, Loss: 0.2112
Step 600, Loss: 0.2668
Step 700, Loss: 0.2560
Step 800, Loss: 0.2480
Step 900, Loss: 0.1477
Step 1000, Loss: 0.2477
Step 1100, Loss: 0.2470
Step 1200, Loss: 0.2165
Step 1300, Loss: 0.3155
Step 1400, Loss: 0.2399
Step 1500, Loss: 0.2420
Step 1600, Loss: 0.1391
Step 1700, Loss: 0.2807
Step 1800, Loss: 0.2698
Step 1900, Loss: 0.2526
Step 2000, Loss: 0.2546


KeyboardInterrupt: 

In [None]:
model.save('decoder_transformer_model.keras')

In [None]:
# model = tf.keras.models.load_model('model.keras')

In [61]:
import numpy as np
def generate(start_string, character_name, max_generate_length):
    if character_name not in speaker_to_index:
        speaker_idx = 0
    else:
        speaker_idx = speaker_to_index[character_name]

    speaker_id_tensor = tf.constant([speaker_idx], dtype=tf.int32)
    input_tokens = sentence_to_token(start_string).ids
    generated_sequence_ids = list(input_tokens)
    full_output_ids = list(generated_sequence_ids)

    for _ in range(max_generate_length):
        current_input_ids = list(full_output_ids)
        if len(current_input_ids) >= max_seq:
            current_input_ids = current_input_ids[-(max_seq - 1):]

        current_tokens_tensor = tf.constant([current_input_ids], dtype=tf.int32)
        padding_mask = create_padding_mask(current_tokens_tensor)

        predictions = model(
            current_tokens_tensor,
            speaker_id_tensor,
            attention_mask=padding_mask,
            training=False
        )

        predicted_logits_for_next_token = predictions[:, -1, :]
        predicted_id = tf.random.categorical(predicted_logits_for_next_token / 0.9, num_samples=1).numpy()[0][0]
        full_output_ids.append(predicted_id)

    final_sentence = token_to_sentence(full_output_ids)
    return final_sentence.strip()


In [62]:
speaker = "Phoebe"
generated_line = generate("Hi ", speaker , max_generate_length=30)
print(f"{speaker}: {generated_line}")

Phoebe: Hi! I, get back!
