In [1]:
import shutil
import numpy as np
import pandas as pd
import ast
import re
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2025-05-14 06:07:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-14 06:07:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-14 06:07:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [3]:
shutil.copytree("/kaggle/input/dailydialog-unlock-the-conversation-potential-in","/kaggle/working/dailydialog-unlock-the-conversation-potential-in",dirs_exist_ok=True)

'/kaggle/working/dailydialog-unlock-the-conversation-potential-in'

In [4]:
train_df = pd.read_csv("/kaggle/working/dailydialog-unlock-the-conversation-potential-in/train.csv")
validation_df = pd.read_csv("/kaggle/working/dailydialog-unlock-the-conversation-potential-in/validation.csv")
test_df = pd.read_csv("/kaggle/working/dailydialog-unlock-the-conversation-potential-in/test.csv")

In [5]:
train_df.head(5)

Unnamed: 0,dialog,act,emotion
0,"['Say , Jim , how about going for a few beers ...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4]
1,"['Can you do push-ups ? '\n "" Of course I can ...",[2 1 2 2 1 1],[0 0 6 0 0 0]
2,"['Can you study with the radio on ? '\n ' No ,...",[2 1 2 1 1],[0 0 0 0 0]
3,['Are you all right ? '\n ' I will be all righ...,[2 1 1 1],[0 0 0 0]
4,"['Hey John , nice skates . Are they new ? '\n ...",[2 1 2 1 1 2 1 3 4],[0 0 0 0 0 6 0 6 0]


In [6]:
train_df["dialog"][0]

'[\'Say , Jim , how about going for a few beers after dinner ? \'\n \' You know that is tempting but is really not good for our fitness . \'\n \' What do you mean ? It will help us to relax . \'\n " Do you really think so ? I don\'t . It will just make us fat and act silly . Remember last time ? "\n " I guess you are right.But what shall we do ? I don\'t feel like sitting at home . "\n \' I suggest a walk over to the gym where we can play singsong and meet some of our friends . \'\n " That\'s a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . "\n \' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . \'\n " Good.Let \' s go now . " \' All right . \']'

In [None]:
def clean_and_split_dialog(raw_dialog):
    raw_dialog = raw_dialog.strip()[1:-1]
    raw_dialog = raw_dialog.replace("\\n", "|")
    raw_dialog = raw_dialog.replace("\n", "|")
    raw_dialog = re.sub(r"\\'", "'", raw_dialog) 
    raw_dialog = re.sub(r"['\"]", "", raw_dialog)
    utterances = [utt.strip() for utt in raw_dialog.split("|") if utt.strip()]
    return utterances

# Apply to entire column
train_df['utterances'] = train_df['dialog'].apply(clean_and_split_dialog)
print("Example split dialog:\n", train_df['utterances'][0])

# Build input-target pairs (utterance-level)
input_texts = []
target_texts = []

for utts in train_df['utterances']:
    for i in range(len(utts) - 1):
        input_texts.append(utts[i])
        target_texts.append(utts[i + 1])

# Sanity check
print(f"Total Pairs: {len(input_texts)}")
for i in range(3):
    print(f"[Input]  {input_texts[i]}")
    print(f"[Target] {target_texts[i]}\n")


Example split dialog:
 ['Say , Jim , how about going for a few beers after dinner ?', 'You know that is tempting but is really not good for our fitness .', 'What do you mean ? It will help us to relax .', 'Do you really think so ? I dont . It will just make us fat and act silly . Remember last time ?', 'I guess you are right.But what shall we do ? I dont feel like sitting at home .', 'I suggest a walk over to the gym where we can play singsong and meet some of our friends .', 'Thats a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .', 'Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .', 'Good.Let  s go now .   All right .']
Total Pairs: 64998
[Input]  Say , Jim , how about going for a few beers after dinner ?
[Target] You know that is tempting but is really not good for our fitness .

[Input]  You know that is tempting but is really not good for our fi

In [8]:

# Tokenize function
def tokenize(sentence):
    return sentence.lower().split()

# Build vocabulary
counter = Counter()
for text in input_texts + target_texts:
    counter.update(tokenize(text))

# Special tokens
special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
word2idx = {tok: idx for idx, tok in enumerate(special_tokens)}
idx2word = special_tokens.copy()

# Add from counter
for word, _ in counter.items():
    if word not in word2idx:
        idx = len(word2idx)
        word2idx[word] = idx
        idx2word.append(word)

vocab_size = len(word2idx)


In [9]:
def sentence_to_indices(sentence, word2idx):
    tokens = tokenize(sentence)
    return [word2idx.get('<SOS>')] + [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens] + [word2idx.get('<EOS>')]


In [10]:
# Convert all sentences
input_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in input_texts]
target_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in target_texts]


In [11]:

max_len = 20  

input_seqs_padded = pad_sequence(
    [s[:max_len] if len(s) > max_len else torch.cat([s, torch.tensor([word2idx['<PAD>']] * (max_len - len(s)))]) for s in input_seqs],
    batch_first=True
)

target_seqs_padded = pad_sequence(
    [s[:max_len] if len(s) > max_len else torch.cat([s, torch.tensor([word2idx['<PAD>']] * (max_len - len(s)))]) for s in target_seqs],
    batch_first=True
)


In [12]:
print(f"Vocab size: {len(word2idx)}")
print("Sample vocab entries:", list(word2idx.items())[:20])


Vocab size: 21856
Sample vocab entries: [('<PAD>', 0), ('<SOS>', 1), ('<EOS>', 2), ('<UNK>', 3), ('say', 4), (',', 5), ('jim', 6), ('how', 7), ('about', 8), ('going', 9), ('for', 10), ('a', 11), ('few', 12), ('beers', 13), ('after', 14), ('dinner', 15), ('?', 16), ('you', 17), ('know', 18), ('that', 19)]


In [13]:
sample_text = input_texts[0]
print("Original text:", sample_text)
print("Tokens:", tokenize(sample_text))
print("Token indices:", sentence_to_indices(sample_text, word2idx))


Original text: Say , Jim , how about going for a few beers after dinner ?
Tokens: ['say', ',', 'jim', ',', 'how', 'about', 'going', 'for', 'a', 'few', 'beers', 'after', 'dinner', '?']
Token indices: [1, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 2]


In [14]:
print("Input tensor shape:", input_seqs_padded.shape)
print("Target tensor shape:", target_seqs_padded.shape)


Input tensor shape: torch.Size([64998, 20])
Target tensor shape: torch.Size([64998, 20])


In [15]:
def decode(indices, idx2word):
    return ' '.join([idx2word[idx] for idx in indices if idx2word[idx] not in ['<PAD>', '<SOS>', '<EOS>']])

print("Decoded input:", decode(input_seqs_padded[0].tolist(), idx2word))
print("Decoded target:", decode(target_seqs_padded[0].tolist(), idx2word))


Decoded input: say , jim , how about going for a few beers after dinner ?
Decoded target: you know that is tempting but is really not good for our fitness .


In [16]:

class DialogDataset(Dataset):
    def __init__(self, input_tensor, target_tensor):
        self.inputs = input_tensor
        self.targets = target_tensor

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


In [None]:


dataset = DialogDataset(input_seqs_padded, target_seqs_padded)

BATCH_SIZE = 256

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


In [None]:
# Test one batch
for batch in dataloader:
    src, tgt = batch
    print("Source shape:", src.shape)
    print("Target shape:", tgt.shape)
    print("Sample decoded input:", decode(src[0].tolist(), idx2word))
    print("Sample decoded target:", decode(tgt[0].tolist(), idx2word))
    break


Source shape: torch.Size([256, 20])
Target shape: torch.Size([256, 20])
Sample decoded input: excuse me , sir . i cant find my baggage . here is my claim tag .
Sample decoded target: dont worry , madam . can you make a description of your baggage ?


In [19]:
validation_df['utterances'] = validation_df['dialog'].apply(clean_and_split_dialog)
test_df['utterances'] = test_df['dialog'].apply(clean_and_split_dialog)


In [20]:
def build_pairs(dialogs):
    input_texts, target_texts = [], []
    for utts in dialogs:
        for i in range(len(utts) - 1):
            input_texts.append(utts[i])
            target_texts.append(utts[i + 1])
    return input_texts, target_texts

val_input_texts, val_target_texts = build_pairs(validation_df['utterances'])
test_input_texts, test_target_texts = build_pairs(test_df['utterances'])


In [21]:
val_input_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in val_input_texts]
val_target_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in val_target_texts]

test_input_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in test_input_texts]
test_target_seqs = [torch.tensor(sentence_to_indices(sent, word2idx)) for sent in test_target_texts]


In [22]:
def pad_sequence_list(seqs, max_len):
    return pad_sequence(
        [s[:max_len] if len(s) > max_len else torch.cat([s, torch.tensor([word2idx['<PAD>']] * (max_len - len(s)))]) for s in seqs],
        batch_first=True
    )

val_input_padded = pad_sequence_list(val_input_seqs, max_len)
val_target_padded = pad_sequence_list(val_target_seqs, max_len)

test_input_padded = pad_sequence_list(test_input_seqs, max_len)
test_target_padded = pad_sequence_list(test_target_seqs, max_len)


In [23]:
val_dataset = DialogDataset(val_input_padded, val_target_padded)
test_dataset = DialogDataset(test_input_padded, test_target_padded)

val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Load pretrained GloVe vectors (100D)
embedding_dim = 100
glove_path = "/kaggle/working/glove.6B.100d.txt"

glove = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        word = parts[0]
        vec = np.array(parts[1:], dtype=np.float32)
        glove[word] = vec

embedding_matrix = np.zeros((len(word2idx), embedding_dim))
for word, idx in word2idx.items():
    embedding_matrix[idx] = glove.get(word, np.random.normal(scale=0.6, size=embedding_dim))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=word2idx['<PAD>'])


In [25]:
class Encoder(nn.Module):
    def __init__(self, embedding_layer, hidden_size=512, dropout=0.3, num_layers=2):
        super().__init__()
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(embedding_layer.embedding_dim, hidden_size, num_layers=num_layers,
                            batch_first=True, bidirectional=True, dropout=dropout)

    def forward(self, src, src_lengths):
        embedded = self.embedding(src)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, (hidden, cell) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden, cell


In [26]:
class BahdanauAttention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super().__init__()
        self.attn = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)
        self.v = nn.Linear(dec_hidden_size, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs, mask):
        batch_size, src_len, _ = encoder_outputs.shape
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((decoder_hidden, encoder_outputs), dim=2)))
        scores = self.v(energy).squeeze(2)
        scores = scores.masked_fill(mask == 0, -1e10)
        return torch.softmax(scores, dim=1)


In [27]:
class Decoder(nn.Module):
    def __init__(self, embedding_layer, enc_hidden_size, dec_hidden_size, attention, dropout=0.3, num_layers=1):
        super().__init__()
        self.embedding = embedding_layer
        self.attention = attention
        self.dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(embedding_layer.embedding_dim + enc_hidden_size * 2,
                            dec_hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        
        self.fc_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size + embedding_layer.embedding_dim,
                                len(word2idx))

    def forward(self, input_token, hidden, cell, encoder_outputs, mask):
        input_token = input_token.unsqueeze(1)
        embedded = self.embedding(input_token)
        attn_weights = self.attention(hidden[-1], encoder_outputs, mask)
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)

        lstm_input = torch.cat((embedded, attn_applied), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))

        output = output.squeeze(1)
        attn_applied = attn_applied.squeeze(1)
        embedded = embedded.squeeze(1)

        prediction = self.fc_out(self.dropout(torch.cat((output, attn_applied, embedded), dim=1)))
        return prediction, hidden, cell, attn_weights


In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device

        enc_hidden_size = encoder.lstm.hidden_size
        dec_hidden_size = decoder.lstm.hidden_size

 
        self.project_hidden = nn.Linear(enc_hidden_size * 2, dec_hidden_size)
        self.project_cell = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def create_mask(self, src):
        return (src != self.pad_idx).to(self.device)  # [B, S]

    def forward(self, src, src_lengths, trg, teacher_forcing_ratio=1.0):
        batch_size, trg_len = trg.shape
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        # Encoder forward
        encoder_outputs, enc_hidden, enc_cell = self.encoder(src, src_lengths)
        mask = self.create_mask(src)  # [B, S]

        # Process bidirectional encoder states
        def cat_and_project(state, proj_layer):
            # Take last layer forward and backward: [-2] and [-1]
            cat = torch.cat((state[-2], state[-1]), dim=1)  # [B, 2*H]
            return torch.tanh(proj_layer(cat)).unsqueeze(0)  # [1, B, H]

        hidden = cat_and_project(enc_hidden, self.project_hidden)
        cell = cat_and_project(enc_cell, self.project_cell)

        # First input is <SOS>
        input_token = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell, _ = self.decoder(input_token, hidden, cell, encoder_outputs, mask)
            outputs[:, t] = output

            # Scheduled teacher forcing
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs


In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enc_hidden_size = 512
dec_hidden_size = 512
attention = BahdanauAttention(enc_hidden_size, dec_hidden_size)

encoder = Encoder(embedding_layer, hidden_size=enc_hidden_size)
decoder = Decoder(embedding_layer, enc_hidden_size, dec_hidden_size, attention)
model = Seq2Seq(encoder, decoder, word2idx['<PAD>'], device).to(device)




In [30]:
criterion = nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=word2idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=1)


In [None]:

def train(model, dataloader, optimizer, criterion, clip, tf_ratio):
    model.train()
    total_loss = 0

    for src, trg in tqdm(dataloader, desc="Training"):
        src, trg = src.to(device), trg.to(device)

        # Lengths (assuming padding is at the end)
        src_lengths = (src != word2idx['<PAD>']).sum(dim=1)

        optimizer.zero_grad()
        output = model(src, src_lengths, trg, teacher_forcing_ratio=tf_ratio)

        output = output[:, 1:].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [32]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, trg in tqdm(dataloader, desc="Evaluating"):
            src, trg = src.to(device), trg.to(device)
            src_lengths = (src != word2idx['<PAD>']).sum(dim=1)

            output = model(src, src_lengths, trg, teacher_forcing_ratio=0.0)
            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            total_loss += loss.item()

    return total_loss / len(dataloader)


In [33]:
NUM_EPOCHS = 20
clip = 1.0
start_tf = 1.0
tf_decay = 0.05
best_val_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    tf_ratio = max(0.5, start_tf - tf_decay * epoch)

    print(f"\nEpoch {epoch+1} | Teacher Forcing: {tf_ratio:.2f}")
    train_loss = train(model, dataloader, optimizer, criterion, clip, tf_ratio)
    val_loss = evaluate(model, val_loader, criterion)

    scheduler.step(val_loss)

    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_seq2seq_model.pt")



Epoch 1 | Teacher Forcing: 1.00


Training: 100%|██████████| 254/254 [02:54<00:00,  1.45it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.45it/s]


Train Loss: 5.8562 | Val Loss: 7.7112

Epoch 2 | Teacher Forcing: 0.95


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.42it/s]


Train Loss: 5.2756 | Val Loss: 7.5846

Epoch 3 | Teacher Forcing: 0.90


Training: 100%|██████████| 254/254 [02:54<00:00,  1.45it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.44it/s]


Train Loss: 5.1516 | Val Loss: 7.3945

Epoch 4 | Teacher Forcing: 0.85


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.44it/s]


Train Loss: 5.1160 | Val Loss: 7.2864

Epoch 5 | Teacher Forcing: 0.80


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.42it/s]


Train Loss: 5.0860 | Val Loss: 7.0044

Epoch 6 | Teacher Forcing: 0.75


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.40it/s]


Train Loss: 5.0830 | Val Loss: 6.8232

Epoch 7 | Teacher Forcing: 0.70


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.37it/s]


Train Loss: 5.0843 | Val Loss: 6.7596

Epoch 8 | Teacher Forcing: 0.65


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.45it/s]


Train Loss: 5.0504 | Val Loss: 6.5758

Epoch 9 | Teacher Forcing: 0.60


Training: 100%|██████████| 254/254 [02:54<00:00,  1.45it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]


Train Loss: 5.0321 | Val Loss: 6.5727

Epoch 10 | Teacher Forcing: 0.55


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.42it/s]


Train Loss: 5.0316 | Val Loss: 6.3381

Epoch 11 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]


Train Loss: 4.9937 | Val Loss: 6.3483

Epoch 12 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:53<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.44it/s]


Train Loss: 4.8399 | Val Loss: 6.3954

Epoch 13 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:53<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.38it/s]


Train Loss: 4.6270 | Val Loss: 6.3064

Epoch 14 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:53<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.42it/s]


Train Loss: 4.5349 | Val Loss: 6.3396

Epoch 15 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.44it/s]


Train Loss: 4.3925 | Val Loss: 6.3839

Epoch 16 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]


Train Loss: 4.2904 | Val Loss: 6.3757

Epoch 17 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]


Train Loss: 4.2442 | Val Loss: 6.3930

Epoch 18 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]


Train Loss: 4.1262 | Val Loss: 6.4172

Epoch 19 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.45it/s]
Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.38it/s]


Train Loss: 4.1218 | Val Loss: 6.4130

Epoch 20 | Teacher Forcing: 0.50


Training: 100%|██████████| 254/254 [02:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.39it/s]

Train Loss: 4.0725 | Val Loss: 6.4302





In [34]:
test_loss = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}")

Evaluating: 100%|██████████| 23/23 [00:09<00:00,  2.45it/s]

Test Loss: 6.3844





In [35]:
def generate_response_beam(model, sentence, beam_width=3, max_len=20):
    model.eval()
    with torch.no_grad():
        input_tensor = torch.tensor(sentence_to_indices(sentence, word2idx)).unsqueeze(0).to(device)
        src_lengths = torch.tensor([input_tensor.shape[1]])
        encoder_outputs, hidden, cell = model.encoder(input_tensor, src_lengths)
        mask = model.create_mask(input_tensor)

        reduced_hidden = torch.tanh(model.project_hidden(torch.cat((hidden[-2], hidden[-1]), dim=1)))
        reduced_cell   = torch.tanh(model.project_cell(torch.cat((cell[-2], cell[-1]), dim=1)))
        hidden = reduced_hidden.unsqueeze(0).repeat(model.decoder.lstm.num_layers, 1, 1)
        cell   = reduced_cell.unsqueeze(0).repeat(model.decoder.lstm.num_layers, 1, 1)

        # Beam state: (tokens, score, hidden, cell)
        beams = [([word2idx['<SOS>']], 0.0, hidden, cell)]
        completed = []

        for _ in range(max_len):
            new_beams = []
            for tokens, score, h, c in beams:
                input_token = torch.tensor([tokens[-1]], device=device)
                output, h_new, c_new, _ = model.decoder(input_token, h, c, encoder_outputs, mask)
                log_probs = torch.log_softmax(output, dim=1).squeeze(0)

                topk = torch.topk(log_probs, beam_width)
                for idx, log_prob in zip(topk.indices.tolist(), topk.values.tolist()):
                    new_seq = tokens + [idx]
                    new_score = score + log_prob
                    if idx == word2idx['<EOS>']:
                        completed.append((new_seq, new_score))
                    else:
                        new_beams.append((new_seq, new_score, h_new, c_new))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            if not beams:
                break

        if not completed:
            completed = beams

        best_seq = sorted(completed, key=lambda x: x[1], reverse=True)[0][0]
        return decode(best_seq[1:], idx2word)  # skip <SOS>


In [37]:
test_sentences = [
    "Hi , how are you ?",
    "What do you want to do tonight ?",
    "Let's go to the park .",
    "Are you hungry ?"
]

for sent in test_sentences:
    response = generate_response_beam(model, sent)
    print(f"> User: {sent}")
    print(f"> Bot : {response}\n")


> User: Hi , how are you ?
> Bot : fine , thank you . i am trying to get adjusted .

> User: What do you want to do tonight ?
> Bot : i have no idea what i want .

> User: Let's go to the park .
> Bot : do you want to go to the park ?

> User: Are you hungry ?
> Bot : yes , i have a lot of my friends .



### Conclusion

- as shown the resualts are not human like but demonstrate the model's ability to learn and generate coherent responses.
- this is a proof of concept, and the model can be further improved and fine-tuned to improve its performance on specific tasks.
- the model's poor preformance an be atributed to the limited training data and the simplicity of the model architecture.
