In [76]:
import os
import torch
import torch.nn as nn
from tokenizers import ByteLevelBPETokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import math
import torch.optim as optim
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

In [77]:
data_path = os.path.abspath("hafez.txt")
vocab_dir = "hafez_vocab"

assert os.path.isfile(data_path), f"{data_path} not found. Please upload the file."

with open(data_path, "r", encoding="utf-8") as file:
    text = file.read()

len(text)

276324

In [78]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[data_path], vocab_size=30000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
    "<lb>"
])






In [79]:
# Save the tokenizer model
if not os.path.exists(vocab_dir):
    os.makedirs(vocab_dir)
tokenizer.save_model(vocab_dir)

['hafez_vocab/vocab.json', 'hafez_vocab/merges.txt']

In [80]:
from tokenizers.implementations import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(
    os.path.join(vocab_dir, "vocab.json"),
    os.path.join(vocab_dir, "merges.txt"),
)

In [81]:
# Tokenize a sample Persian text
sample_text = """حافظ شب هجران شد بوی خوش وصل آمد
شاديت مبارک باد ای عاشق شيدايی
ای دل گر از آن چاه زنخدان به درآيی
هر جا که روی زود پشيمان به درآيی
هش دار که گر وسوسه عقل کنی گوش
آدم صفت از روضه رضوان به درآيی"""

# Replace line breaks with <lb>
sample_text = sample_text.replace("\n", " <lb> ")

encoded = tokenizer.encode(sample_text)
print(f"Tokens: {encoded.tokens}")
print(f"Token IDs: {encoded.ids}")

# Decode the token IDs back to text
decoded_text = tokenizer.decode(encoded.ids)
print(f"Decoded Text: {decoded_text}")

Tokens: ['ØŃØ§ÙģØ¸', 'ĠØ´Ø¨', 'ĠÙĩØ¬Ø±Ø§ÙĨ', 'ĠØ´Ø¯', 'ĠØ¨ÙĪÛĮ', 'ĠØ®ÙĪØ´', 'ĠÙĪØµÙĦ', 'ĠØ¢ÙħØ¯', 'Ġ', '<', 'l', 'b', '>', 'ĠØ´Ø§Ø¯', 'ÙĬØª', 'ĠÙħØ¨Ø§Ø±Ú©', 'ĠØ¨Ø§Ø¯', 'ĠØ§ÛĮ', 'ĠØ¹Ø§Ø´ÙĤ', 'ĠØ´ÙĬØ¯Ø§ÙĬÛĮ', 'Ġ', '<', 'l', 'b', '>', 'ĠØ§ÛĮ', 'ĠØ¯ÙĦ', 'ĠÚ¯Ø±', 'ĠØ§Ø²', 'ĠØ¢ÙĨ', 'ĠÚĨØ§Ùĩ', 'ĠØ²ÙĨØ®Ø¯Ø§ÙĨ', 'ĠØ¨Ùĩ', 'ĠØ¯Ø±Ø¢ÙĬÛĮ', 'Ġ', '<', 'l', 'b', '>', 'ĠÙĩØ±', 'ĠØ¬Ø§', 'ĠÚ©Ùĩ', 'ĠØ±ÙĪÛĮ', 'ĠØ²ÙĪØ¯', 'ĠÙ¾Ø´ÙĬÙħØ§ÙĨ', 'ĠØ¨Ùĩ', 'ĠØ¯Ø±Ø¢ÙĬÛĮ', 'Ġ', '<', 'l', 'b', '>', 'ĠÙĩØ´', 'ĠØ¯Ø§Ø±', 'ĠÚ©Ùĩ', 'ĠÚ¯Ø±', 'ĠÙĪØ³ÙĪØ³Ùĩ', 'ĠØ¹ÙĤÙĦ', 'ĠÚ©ÙĨÛĮ', 'ĠÚ¯ÙĪØ´', 'Ġ', '<', 'l', 'b', '>', 'ĠØ¢Ø¯Ùħ', 'ĠØµÙģØª', 'ĠØ§Ø²', 'ĠØ±ÙĪØ¶Ùĩ', 'ĠØ±Ø¶ÙĪØ§ÙĨ', 'ĠØ¨Ùĩ', 'ĠØ¯Ø±Ø¢ÙĬÛĮ']
Token IDs: [487, 505, 1212, 403, 683, 420, 789, 508, 226, 33, 81, 71, 35, 1398, 539, 2742, 381, 413, 635, 4768, 226, 33, 81, 71, 35, 413, 338, 399, 311, 353, 1780, 2081, 304, 2074, 226, 33, 81, 71, 35, 418, 668, 296, 467, 2544, 3337, 304, 2074, 226, 33, 81, 71, 35, 2005, 390, 296, 399, 4866, 977, 1144, 547, 226, 33, 81, 71, 35, 1778, 2

In [82]:
def tokenize_sentences(sentences, tokenizer):
    return [tokenizer.encode(sentence).ids for sentence in sentences]

In [83]:
# Split the text into verses using double newlines
verses = text.split("\n\n")

# Replace single newlines within verses with <lb> and remove any empty verses
verses = [verse.replace("\n", " <lb> ") for verse in verses if verse.strip()]

# Tokenize each verse
tokenized_verses = [tokenizer.encode(verse).ids for verse in verses]

# Pad or truncate the verses to have uniform length (max_len)
max_len = 48
padded_tokenized_verses = [verse + [tokenizer.token_to_id('<pad>')] * (max_len - len(verse)) for verse in tokenized_verses]

print(f"Number of padded verses: {len(padded_tokenized_verses)}")
print(f"Example padded verse: {padded_tokenized_verses[0]}")
print(tokenizer.decode(padded_tokenized_verses[0]))

Number of padded verses: 4191
Example padded verse: [177, 125, 129, 2010, 727, 5030, 483, 911, 3038, 6276, 289, 952, 3987, 263, 226, 33, 81, 71, 35, 296, 419, 1833, 1663, 1642, 831, 765, 1826, 307, 545, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
﻿الا يا ايها الساقی ادر کاسا و ناولها <lb> که عشق آسان نمود اول ولی افتاد مشکل‌ها<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [84]:
# Split data into training, validation, and test sets
train_data, test_val_data = train_test_split(padded_tokenized_verses, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_val_data, test_size=0.5, random_state=42)

print(f"Train data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

Train data size: 2933
Validation data size: 629
Test data size: 629


In [85]:
def create_dataloader(data, batch_size):
    input_ids = torch.tensor(data)
    dataset = TensorDataset(input_ids)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

batch_size = 8

train_dataloader = create_dataloader(train_data, batch_size)
val_dataloader = create_dataloader(val_data, batch_size)
test_dataloader = create_dataloader(test_data, batch_size)

print(f"Number of batches in train dataloader: {len(train_dataloader)}")
print(f"Number of batches in val dataloader: {len(val_dataloader)}")
print(f"Number of batches in test dataloader: {len(test_dataloader)}")

Number of batches in train dataloader: 367
Number of batches in val dataloader: 79
Number of batches in test dataloader: 79


In [86]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)  # (max_len, d_model) => (1, max_len, d_model)

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :].to(x.device)

class GPT2Embedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len):
        super(GPT2Embedding, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(d_model, max_len)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input_ids):
        token_embeddings = self.token_embedding(input_ids)
        position_embeddings = self.position_encoding(token_embeddings)
        embeddings = self.layer_norm(position_embeddings)
        return embeddings

# B : batch_size, H: nhead, D: d_model, S: seq_len, M: max_len
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super(MultiHeadAttention, self).__init__()
        assert d_model % nhead == 0
        self.d_k = d_model // nhead
        self.nhead = nhead
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.attention = None

    def scaled_dot_product_attention(self, query, key, value, mask=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k) # (B, H, S, d_k) * (B, H, d_k, S) => (B, H, S, S)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = torch.softmax(scores, dim=-1)
        return torch.matmul(p_attn, value), p_attn

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        # query, key, value: (B, S, D) -> (B, S, N, d_K) -> (B, H, S, d_K)
        query = self.linear_q(query).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        key   = self.linear_k(key)  .view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        value = self.linear_v(value).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)

        x, self.attention = self.scaled_dot_product_attention(query, key, value, mask) # x: (B, H, S, d_k)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.nhead * self.d_k)#  x: (B, H, S, d_k) -> (B, S, H, d_k) -> (B, S, D)
        return self.linear_out(x) # (B, S, D)


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))


class DecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, d_ff=2048):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        # Self-attention
        attn_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.layer_norm1(x)

        # Feed-forward network
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layer_norm2(x)

        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len, d_ff=2048):
        super(Decoder, self).__init__()
        self.embedding = GPT2Embedding(vocab_size, d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, nhead, d_ff) for _ in range(num_layers)])
        self.linear_out = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids, mask=None):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x, mask)
        logits = self.linear_out(x)
        return logits

In [207]:
# Initialize model, criterion, and optimizer
model = Decoder(vocab_size, d_model, nhead, num_layers, max_len, d_ff)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [208]:
# Define hyperparameters
vocab_size = 30000
d_model = 512
nhead = 8
num_layers = 6
max_len = 48
d_ff = 2048
learning_rate = 1e-4

# first model's results:
# Train Loss: 0.002588495343548629
# Val   Loss: 0.2399774716247486
# Test  Loss: 0.22456781816067575

In [None]:
# Training loop with early stopping
num_epochs = 10
early_stopping_patience = 5
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch[0]
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0]
            outputs = model(input_ids)
            loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Evaluation on the test set
model.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0]
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        test_loss += loss.item()

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

In [209]:
# Load the best model
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

Decoder(
  (embedding): GPT2Embedding(
    (token_embedding): Embedding(30000, 512)
    (position_encoding): PositionalEncoding()
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (linear_q): Linear(in_features=512, out_features=512, bias=True)
        (linear_k): Linear(in_features=512, out_features=512, bias=True)
        (linear_v): Linear(in_features=512, out_features=512, bias=True)
        (linear_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (relu): ReLU()
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplac

In [None]:
def generate_text(model, tokenizer, input_text, max_length=100, temperature=1.0):
    model.eval()
    input_ids = tokenizer.encode(input_text).ids
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Shape: (1, seq_len)

    for _ in range(max_length - len(input_ids[0])):
        with torch.no_grad():
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :] / temperature
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            input_ids = torch.cat((input_ids, next_token), dim=1)

            if next_token.item() == tokenizer.token_to_id('<eos>'):
                break

    generated_text = tokenizer.decode(input_ids.squeeze().tolist())
    return generated_text

# Example usage
input_text = "ای دل گر از آن چاه زنخدان به درآیی"
generated_text = generate_text(model, tokenizer, input_text, max_length=48, temperature=1.0)
print(generated_text)

In [219]:
# Initialize model, criterion, optimizer, and scheduler
model = Decoder(vocab_size, d_model, nhead, num_layers, max_len, d_ff)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # Example scheduler

In [218]:
# new model
vocab_size = 20000
d_model = 768
nhead = 12 
num_layers = 8
max_len = 64
d_ff = 3072
learning_rate = 1e-5

In [182]:
# Training loop with early stopping and learning rate scheduler
num_epochs = 10
early_stopping_patience = 5
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch[0]
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0]
            outputs = model(input_ids)
            loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_2.pt')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

    scheduler.step()  # Step the scheduler

# Load the best model
model.load_state_dict(torch.load('best_model_2.pt'))

# Evaluation on the test set
model.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0]
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        test_loss += loss.item()

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")


Epoch 1/10, Train Loss: 0.00018994479081469744
Epoch 1/10, Validation Loss: 0.17879455501237249
Epoch 2/10, Train Loss: 0.00018797446795588876
Epoch 2/10, Validation Loss: 0.17801189200976228
Epoch 3/10, Train Loss: 0.00018423027749511488
Epoch 3/10, Validation Loss: 0.17763244370116463
Epoch 4/10, Train Loss: 0.00018375191746114014
Epoch 4/10, Validation Loss: 0.1780738322229325
Epoch 5/10, Train Loss: 0.0001843025260492309
Epoch 5/10, Validation Loss: 0.17754007121430168
Epoch 6/10, Train Loss: 0.00018400431483998585
Epoch 6/10, Validation Loss: 0.17763497998725764
Epoch 7/10, Train Loss: 0.00018445390432994375
Epoch 7/10, Validation Loss: 0.17818778762711754
Epoch 8/10, Train Loss: 0.00018276167848389798
Epoch 8/10, Validation Loss: 0.17832909632898583
Epoch 9/10, Train Loss: 0.00018234126976715293
Epoch 9/10, Validation Loss: 0.17818146142401273
Epoch 10/10, Train Loss: 0.00018105168879375987
Epoch 10/10, Validation Loss: 0.1782459442421228
Early stopping triggered
Test Loss: 0.176

In [220]:
model.load_state_dict(torch.load('best_model_2.pt'))
model.eval()

Decoder(
  (embedding): GPT2Embedding(
    (token_embedding): Embedding(20000, 768)
    (position_encoding): PositionalEncoding()
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-7): 8 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (linear_q): Linear(in_features=768, out_features=768, bias=True)
        (linear_k): Linear(in_features=768, out_features=768, bias=True)
        (linear_v): Linear(in_features=768, out_features=768, bias=True)
        (linear_out): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=768, out_features=3072, bias=True)
        (relu): ReLU()
        (linear2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplac

In [240]:
# Example usage
input_text = "ای دل گر از آن چاه زنخدان به درآيی"
generated_text = generate_text(model, tokenizer, input_text, max_length=64, temperature=17)
print(generated_text)

ای دل گر از آن چاه زنخدان به درآيی وداع مگير بيفشانی خونين بسوزی لعورش قلمفراق کاری مغبازده مومعی صحيفه ببر توان شمار بريده جانادبير کلام


ای دل گر از آن چاه زنخدان به درآيی
وداع مگير بيفشانی خونين بسوزی
لعورش قلمفراق کاری مغبازده مومعی صحيفه
ببر توان شمار بريده جانا دبير کلام