In [20]:
import os
import torch
import torch.nn as nn
from tokenizers import ByteLevelBPETokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset,random_split
import math
import torch.optim as optim
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from gensim.models import KeyedVectors
import fasttext.util
import numpy as np

In [4]:
data_path = os.path.abspath("hafez.txt")
vocab_dir = "hafez_vocab"

assert os.path.isfile(data_path), f"{data_path} not found. Please upload the file."

with open(data_path, "r", encoding="utf-8") as file:
    text = file.read()

len(text)

276324

In [5]:
# Train the tokenizer
tokenizer = ByteLevelBPETokenizer()
data_path = "hafez.txt"
vocab_dir = "hafez_vocab"

tokenizer.train(files=[data_path], vocab_size=30000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
    "<lb>"
])

# Save the tokenizer model
if not os.path.exists(vocab_dir):
    os.makedirs(vocab_dir)
tokenizer.save_model(vocab_dir)

# Load the tokenizer
tokenizer = ByteLevelBPETokenizer(
    os.path.join(vocab_dir, "vocab.json"),
    os.path.join(vocab_dir, "merges.txt"),
)






In [7]:
# Split the text into verses using double newlines
verses = text.split("\n\n")

# Replace single newlines within verses with <lb> and remove any empty verses
verses = [verse.replace("\n", " <lb> ") for verse in verses if verse.strip()]

# Tokenize each verse
tokenized_verses = [tokenizer.encode(verse).ids for verse in verses]

# Pad or truncate the verses to have uniform length (max_len)
max_len = 48
padded_tokenized_verses = [verse + [tokenizer.token_to_id('<pad>')] * (max_len - len(verse)) for verse in tokenized_verses]

print(f"Number of padded verses: {len(padded_tokenized_verses)}")
print(f"Example padded verse: {padded_tokenized_verses[0]}")
print(tokenizer.decode(padded_tokenized_verses[0]))

Number of padded verses: 4191
Example padded verse: [177, 125, 129, 2010, 727, 5030, 483, 911, 3038, 6276, 289, 952, 3987, 263, 226, 33, 81, 71, 35, 296, 419, 1833, 1663, 1642, 831, 765, 1826, 307, 545, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
﻿الا يا ايها الساقی ادر کاسا و ناولها <lb> که عشق آسان نمود اول ولی افتاد مشکل‌ها<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [9]:
# Split data into training, validation, and test sets
train_data, test_val_data = train_test_split(padded_tokenized_verses, test_size=0.3, random_state=42)
val_data, test_data       = train_test_split(test_val_data, test_size=0.5, random_state=42)

print(f"Train data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

Train data size: 2933
Validation data size: 629
Test data size: 629


In [33]:
# Convert to PyTorch tensors
train_tensor = torch.tensor(train_data)
val_tensor = torch.tensor(val_data)
test_tensor = torch.tensor(test_data)

# Create TensorDataset
train_dataset = TensorDataset(train_tensor)
val_dataset = TensorDataset(val_tensor)
test_dataset = TensorDataset(test_tensor)

# Create DataLoaders
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [18]:
fasttext_model_path = 'cc.fa.300.vec'  # Pre-trained FastText vectors Path
fasttext_vectors = KeyedVectors.load_word2vec_format(fasttext_model_path)

In [23]:
vocab_size = len(tokenizer.get_vocab())
vocab_size

6630

In [24]:
def create_embedding_matrix(tokenizer, fasttext_vectors, embedding_dim):
    vocab = tokenizer.get_vocab()
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    word_to_idx = {word: idx for word, idx in vocab.items()}

    for word, idx in word_to_idx.items():
        if word in fasttext_vectors:
            embedding_matrix[idx] = fasttext_vectors[word]
        else:
            # Use a random vector or zero vector for unknown words
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    
    return embedding_matrix

# Example usage
embedding_dim = 300  # Dimension of FastText embeddings
embedding_matrix = create_embedding_matrix(tokenizer, fasttext_vectors, embedding_dim)

In [15]:
class PretrainedEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, pretrained_weights):
        super(PretrainedEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_weights, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Freeze the weights if you don't want them to be updated

    def forward(self, input_ids):
        return self.embedding(input_ids)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        seq_len = x.size(1)
        if seq_len > self.encoding.size(1):
            position = torch.arange(0, seq_len).unsqueeze(1).float().to(x.device)
            div_term = torch.exp(torch.arange(0, self.encoding.size(2), 2).float() * -(math.log(10000.0) / self.encoding.size(2))).to(x.device)
            encoding = torch.zeros(seq_len, self.encoding.size(2)).to(x.device)
            encoding[:, 0::2] = torch.sin(position * div_term)
            encoding[:, 1::2] = torch.cos(position * div_term)
            encoding = encoding.unsqueeze(0)
        else:
            encoding = self.encoding[:, :seq_len, :]
        return x + encoding.to(x.device)
    
class GPT2Embedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, pretrained_weights):
        super(GPT2Embedding, self).__init__()
        self.token_embedding = PretrainedEmbedding(vocab_size, d_model, pretrained_weights)
        self.position_encoding = PositionalEncoding(d_model, max_len)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input_ids):
        token_embeddings = self.token_embedding(input_ids)
        position_embeddings = self.position_encoding(token_embeddings)
        embeddings = self.layer_norm(position_embeddings)
        return embeddings

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super(MultiHeadAttention, self).__init__()
        assert d_model % nhead == 0
        self.d_k = d_model // nhead
        self.nhead = nhead
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.attention = None

    def scaled_dot_product_attention(self, query, key, value, mask=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = torch.softmax(scores, dim=-1)
        return torch.matmul(p_attn, value), p_attn

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        query = self.linear_q(query).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        key = self.linear_k(key).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        value = self.linear_v(value).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        x, self.attention = self.scaled_dot_product_attention(query, key, value, mask)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.nhead * self.d_k)
        return self.linear_out(x), self.attention

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, d_ff=2048):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.layer_norm1(x)
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layer_norm2(x)
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len, d_ff, pretrained_weights):
        super(Decoder, self).__init__()
        self.embedding = GPT2Embedding(vocab_size, d_model, max_len, pretrained_weights)
        self.layers = nn.ModuleList([DecoderLayer(d_model, nhead, d_ff) for _ in range(num_layers)])
        self.linear_out = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids, mask=None):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x, mask)
        logits = self.linear_out(x)
        return logits
    

In [43]:
# hyperparameters
vocab_size = 30000
d_model = 300
nhead = 6
num_layers = 6
max_len = 48
d_ff = 2048
learning_rate = 1e-4

In [44]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [31]:
# model = Decoder(vocab_size, d_model, nhead, num_layers, max_len, d_ff, embedding_matrix)
criterion = nn.CrossEntropyLoss()

In [45]:
num_epochs = 20
early_stopping_patience = 5
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch[0]
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0]
            outputs = model(input_ids)
            loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_3.pt')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

    scheduler.step()

# Load the best model
model.load_state_dict(torch.load('best_model_3.pt'))

# Evaluation on the test set
model.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0]
        outputs = model(input_ids)
        loss = criterion(outputs[:, :-1].reshape(-1, vocab_size), input_ids[:, 1:].reshape(-1))
        test_loss += loss.item()

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

Epoch 1/20, Train Loss: 0.0012892781220295508
Epoch 1/20, Validation Loss: 0.19201218858361244
Epoch 2/20, Train Loss: 0.0013231848437420051
Epoch 2/20, Validation Loss: 0.19722489304840565
Epoch 3/20, Train Loss: 0.0015287892469053115
Epoch 3/20, Validation Loss: 0.191148653998971
Epoch 4/20, Train Loss: 0.0011166712502017617
Epoch 4/20, Validation Loss: 0.18856663927435874
Epoch 5/20, Train Loss: 0.0009097629669628551
Epoch 5/20, Validation Loss: 0.1929286364465952
Epoch 6/20, Train Loss: 0.0005074602203145016
Epoch 6/20, Validation Loss: 0.1840894803404808
Epoch 7/20, Train Loss: 0.0005788098601244754
Epoch 7/20, Validation Loss: 0.18542455434799193
Epoch 8/20, Train Loss: 0.0006118243945820723
Epoch 8/20, Validation Loss: 0.1842499364167452
Epoch 9/20, Train Loss: 0.0003031726324558764
Epoch 9/20, Validation Loss: 0.1854371376335621
Epoch 10/20, Train Loss: 0.0003317824791694242
Epoch 10/20, Validation Loss: 0.18403465077280998
Epoch 11/20, Train Loss: 0.0001969152051416408
Epoch 1

In [46]:
model.load_state_dict(torch.load('best_model_3.pt'))
model.eval()

Decoder(
  (embedding): GPT2Embedding(
    (token_embedding): PretrainedEmbedding(
      (embedding): Embedding(30000, 300)
    )
    (position_encoding): PositionalEncoding()
    (layer_norm): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (linear_q): Linear(in_features=300, out_features=300, bias=True)
        (linear_k): Linear(in_features=300, out_features=300, bias=True)
        (linear_v): Linear(in_features=300, out_features=300, bias=True)
        (linear_out): Linear(in_features=300, out_features=300, bias=True)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=300, out_features=2048, bias=True)
        (relu): ReLU()
        (linear2): Linear(in_features=2048, out_features=300, bias=True)
      )
      (layer_norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((300,), eps=1e-05, elementwise_affin

In [186]:
def generate_text(model, tokenizer, input_text, max_length=100, temperature=1.0, top_k=50):
    model.eval()
    input_ids = tokenizer.encode(input_text).ids
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Shape: (1, seq_len)

    vocab_size = model.embedding.token_embedding.embedding.num_embeddings
    generated = input_ids

    for _ in range(max_length - len(input_ids[0])):
        with torch.no_grad():
            outputs = model(generated)
            next_token_logits = outputs[:, -1, :] / temperature

            # Apply top-k sampling
            top_k_probs, top_k_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), top_k)
            next_token = torch.multinomial(top_k_probs, num_samples=1)
            next_token = top_k_indices.gather(1, next_token)

            # Ensure the sampling does not generate out-of-vocabulary tokens
            while next_token.item() >= vocab_size:
                next_token = torch.multinomial(top_k_probs, num_samples=1)
                next_token = top_k_indices.gather(1, next_token)

            generated = torch.cat((generated, next_token), dim=1)

            if next_token.item() == tokenizer.token_to_id('<eos>'):
                break

    generated_text = tokenizer.decode(generated.squeeze().tolist())
    return generated_text

# Example usage
input_text = "ای دل گر از آن چاه زنخدان به درآیی"
generated_text = generate_text(model, tokenizer, input_text, max_length=64, temperature=5, top_k=50)
print(generated_text)

ای دل گر از آن چاه زنخدان به درآیی بود بروم پرست درآيی درآيی وای رمانی رمانی ارزد نوشت جاميدن بازرسان شست هوا برين چو دولت آمده مقامدم هوا چو بندگان بازرسان خداداده پاک هوا رقم حور گلرنگ بگشا غارت صوابسی سفله سيه الله هوا دم گرديدن بودی مستش رفتم معتقد عتاب ديده الف ازايتی جور سحر زرد
