# Arabic Language Modeling Comparison
This notebook implements and compares three different architectures for Arabic language modeling:
- N-gram model (traditional statistical approach)
- LSTM-based RNN
- Transformer with Flash Attention
- Mamba SSM

The models are trained on Arabic text data and evaluated using perplexity metrics. All models use the same BERT tokenizer, word embeddings and close parameters sizes for fair comparison.

## Setup and Configuration
Key configurations:
- Context size: 4
- Embedding dimension: 512
- Sequence length: 64
- Batch size: 256
- Training split: 90%

In [1]:
import os
import glob
from tokenizers import BertWordPieceTokenizer, normalizers
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
import pickle


np.random.seed(seed=1)
torch.manual_seed(1)
os.environ["TOKENIZERS_PARALLELISM"] = "true"
torch.multiprocessing.set_start_method('spawn', force=True)
CONTEXT_SIZE = 4
EMBEDDING_DIM = 512
files = [name for name in glob.glob('data/archive (2)/arwiki_books_shards/content/sharded/*.txt')] 
np.random.shuffle(files)
vocab_path = 'data/archive (2)/ar_bert32k-vocab.txt'
tokenizer = BertWordPieceTokenizer(vocab_path,strip_accents=False)
max_length=2*CONTEXT_SIZE+1

## 1. N-gram Model Implementation
Traditional N-gram model implementation with:
- N = 2 (bigram model)
- Basic smoothing for unseen sequences

In [2]:
import nltk
from nltk.util import ngrams
from nltk import FreqDist
import tqdm
import string


n = 2

def process_files(files):
    tokens = []
    translation_table = str.maketrans('', '', string.punctuation)
    
    for file_path in tqdm.tqdm(files):
        with open(file_path, 'r') as content_file:
            text = content_file.read().translate(translation_table)
            tokens.extend(tokenizer.encode(text).ids)

    print('Done')
    return tokens

def build_ngram_model(tokens, n=n):
    n_grams = ngrams(tokens, n, pad_right=True)
    return FreqDist(n_grams)

def predict_next_word(model, context, num_suggestions=1):
    context = context.translate(str.maketrans('', '', string.punctuation))
    context_tokens = tokenizer.encode(context, add_special_tokens=False).ids
    
    context_ngrams = {ngram[-1]: count for ngram, count in model.items() if ngram[:-1] == tuple(context_tokens)}
    sorted_ngrams = sorted(context_ngrams.items(), key=lambda item: item[1], reverse=True)
    return [tokenizer.id_to_token(word) for word, _ in sorted_ngrams[:num_suggestions]]

In [3]:
train_size=int(len(files)*0.9)

In [None]:
train_tokens = process_files(files[:train_size])
n_gram_model = build_ngram_model(train_tokens, n)

In [None]:

# save model
if not os.path.exists('ngram.pkl'):
        with open('ngram.pkl', 'wb') as f:
                pickle.dump(n_gram_model, f)

In [None]:
with open('ngram.pkl', 'rb') as f:
    n_gram_model = pickle.load(f)

In [None]:
len(n_gram_model)

47823068

In [None]:
start_sequence ='الكتاب'
predicted_words = predict_next_word(n_gram_model, start_sequence, num_suggestions=10)
print(predicted_words)

['والسنة', '،', 'من', 'في', 'المقدس', 'الذي', 'على', 'لا', 'إلا', 'والحكمة']


In [None]:
def calculate_perplexity_ngram(model, test_tokens, n=2):
    n_grams =list(ngrams(test_tokens, n, pad_right=True))
    total_log_prob = 0
    for n_gram in n_grams:
        context = n_gram[:-1]
        word = n_gram[-1]
        if context in model:
            context_prob = model[context].freq(word) / model[context].N()
        else:
            context_prob = 1 / model.N()  # Smoothing
        total_log_prob += np.log(context_prob)
    
    perplexity = np.exp(-total_log_prob / len(n_grams))
    return perplexity


In [None]:
test_tokens = process_files(files[train_size:])
ngram_perplexity = calculate_perplexity_ngram(n_gram_model, test_tokens, n=n)
print(f'N-Gram Model Perplexity: {ngram_perplexity}')

100%|██████████| 52/52 [05:40<00:00,  6.55s/it]


Done
N-Gram Model Perplexity: 690866710.0103099


## 2. LSTM-based RNN Model
Implementation details:
- 7-layer LSTM architecture
- Shared embedding layer with initial word2vec weights
- Uses AdamW optimizer with weight decay
- Linear learning rate schedule

In [2]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(f'checkpoints/word2vec_{EMBEDDING_DIM}_c{CONTEXT_SIZE}_n{CONTEXT_SIZE}_cobw.bin')
not_train= []
embedding=nn.Embedding(tokenizer.get_vocab_size(),EMBEDDING_DIM)
for word, idx in tokenizer.get_vocab().items():
    if model.has_index_for(word):
        with torch.no_grad():
            embedding.weight[idx]=torch.tensor(model.get_vector(word))
            not_train.append(idx)


In [3]:
# def criterion(output,true_t):
#     return 1- torch.mean(F.cosine_similarity(output,embedding(true_t)))  
criterion = nn.CrossEntropyLoss()


In [4]:
from utils import create_streaming_dataloaders

max_length = 64
batch_size = 256
batch_buffer_size = 100
stride = 32
train_size=int(len(files)*0.9)
trainloader = create_streaming_dataloaders(
    files=files[:train_size],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=stride,  
    batch_buffer_size=batch_buffer_size,
    shuffle_buffer=True,
)

validloader = create_streaming_dataloaders(
    files=files[train_size:],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=max_length,  
    batch_buffer_size=batch_buffer_size,
    shuffle_buffer= False

)

In [5]:


num_layers=7

# embedding=nn.Embedding(tokenizer.get_vocab_size(),EMBEDDING_DIM)
# embedding.load_state_dict(torch.load('gramEmbed.pt'))
# embedding.weight.requires_grad=False
class StarLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(StarLSTM, self).__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # self.lstm2 = nn.LSTM( hidden_size, input_size, num_layers, batch_first=True)

    def forward(self, x):
        # Initialize hidden state with zeros
        # h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        
        # Forward propagate LSTM
        o, (hn, cn) = self.lstm1(x)
        # print(h[0].shape)
        # o, (hn, cn) = self.lstm2(hn[-1], (h0, c0))
   
        return o

lstm=StarLSTM(EMBEDDING_DIM, EMBEDDING_DIM, num_layers=num_layers)
#lstm.load_state_dict(torch.load(f'best_lstm{num_layers}.pt'))

In [6]:
f"parameters : {sum(p.numel() for p in lstm.parameters() if p.requires_grad):,}"

'parameters : 14,708,736'

In [7]:
embedding.weight.shape

torch.Size([32000, 512])

In [8]:
total_epochs=6
learning_rate =0.001  
weight_decay = 0.002     
# momentum=0.9 

In [9]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.AdamW( [{'params':lstm.parameters(),'lr': learning_rate}, {'params':embedding.parameters(), 'lr': learning_rate*0.3}] ,weight_decay=weight_decay,)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,start_factor=1, end_factor=0.1, total_iters=total_epochs)
# torch.set_float32_matmul_precision('high')
# torch.backends.cuda.matmul.allow_tf32=True
# torch.backends.cudnn.allow_tf32=True

In [10]:
lstm=lstm.to(device)

In [11]:
embedding.weight.shape[0]

32000

In [12]:
from utils import train, setup_training_state
lstm = torch.compile(lstm)
lstm, embedding, start_epoch = setup_training_state(
        model=lstm,
        embedding=embedding,
        checkpoint_dir='checkpoints',
        num_layers=num_layers,
        prefix='LSTMembedAslinear'
    )
# load_model_and_embedding(lstm, embedding, prefix='embedAslinear', num_layers=num_layers)

No checkpoint found, starting fresh training


In [13]:
train_losses, valid_losses=train(lstm, embedding, optimizer, scheduler, total_epochs - start_epoch, trainloader, validloader,criterion,num_layers, use_amp=True, prefix='LSTMembedAslinear')

Epoch 1 Loss: 5.1930: : 86888it [1:59:24, 12.13it/s]
Validating: 5246it [03:42, 23.55it/s]


Epoch:1 | train loss:5.1146 | valid loss:0.0193 | lr: [0.001, 0.0003]
----------------------------------------------------------------------------------------------------


Epoch 2 Loss: 5.0016: : 86888it [1:59:00, 12.17it/s]
Validating: 5246it [03:42, 23.59it/s]


Epoch:2 | train loss:4.7566 | valid loss:0.0187 | lr: [0.00085, 0.00025499999999999996]
----------------------------------------------------------------------------------------------------


Epoch 3 Loss: 4.9578: : 86888it [1:58:33, 12.21it/s]
Validating: 5246it [03:44, 23.36it/s]


Epoch:3 | train loss:4.6515 | valid loss:0.0184 | lr: [0.0006999999999999999, 0.00020999999999999995]
----------------------------------------------------------------------------------------------------


Epoch 4 Loss: 4.8908: : 86888it [1:58:29, 12.22it/s]
Validating: 5246it [03:41, 23.66it/s]


Epoch:4 | train loss:4.5852 | valid loss:0.0182 | lr: [0.0005499999999999999, 0.00016499999999999997]
----------------------------------------------------------------------------------------------------


Epoch 5 Loss: 4.8507: : 86888it [1:58:21, 12.24it/s]
Validating: 5246it [03:41, 23.67it/s]


Epoch:5 | train loss:4.5354 | valid loss:0.0180 | lr: [0.00039999999999999996, 0.00011999999999999999]
----------------------------------------------------------------------------------------------------


Epoch 6 Loss: 4.8502: : 86888it [1:58:27, 12.22it/s]
Validating: 5246it [03:42, 23.58it/s]


Epoch:6 | train loss:4.4946 | valid loss:0.0178 | lr: [0.00025, 7.5e-05]
----------------------------------------------------------------------------------------------------


In [14]:
from utils import calculate_perplexity
rnn_perplexity = calculate_perplexity(lstm,embedding, validloader, device)
print(f'RNN Model Perplexity: {rnn_perplexity}')

RNN Model Perplexity: 95.08717125843353


In [17]:
from utils import sample_from_model
start_sequence = 'هذا الكتاب'
generated_text = sample_from_model(lstm,embedding,tokenizer,start_sequence,top_k=10,num_generate=20)
print("Generated text:", generated_text)

Generated text: ['هذا الكتاب ، ولم يزل كذلك إلى أن مات. قال ابن الجوزي كان من أهل الأدب ، كثير التواضع وحسن الخلق']


## 3. Transformer Model
Key features:
- Flash Attention for efficiency
- 6 transformer layers
- Feed-forward network with expansion ratio 2.66
- Dropout rate 0.1
- Shared embedding layer with learnable positional encodings

In [2]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(f'checkpoints/word2vec_{EMBEDDING_DIM}_c{CONTEXT_SIZE}_n{CONTEXT_SIZE}_cobw.bin')
not_train= []
embedding=nn.Embedding(tokenizer.get_vocab_size(),EMBEDDING_DIM)
for word, idx in tokenizer.get_vocab().items():
    if model.has_index_for(word):
        with torch.no_grad():
            embedding.weight[idx]=torch.tensor(model.get_vector(word))
            not_train.append(idx)


In [3]:
# def criterion(output,true_t):
#     return 1- torch.mean(F.cosine_similarity(output,embedding(true_t)))  
criterion = nn.CrossEntropyLoss()


In [4]:
from utils import create_streaming_dataloaders

max_length = 64
batch_size = 256
batch_buffer_size = 100
stride = 32
train_size=int(len(files)*0.9)
trainloader = create_streaming_dataloaders(
    files=files[:train_size],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=stride,  
    batch_buffer_size=batch_buffer_size,
    num_workers=8,
    shuffle_buffer=True,
)

validloader = create_streaming_dataloaders(
    files=files[train_size:],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=max_length,  
    batch_buffer_size=batch_buffer_size,
    num_workers=4,
    shuffle_buffer= False,

)

In [5]:
from torch.nn.attention import SDPBackend, sdpa_kernel

class FlashAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        
        self.q_proj = nn.Linear(dim, dim)
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)
        self.dropout = dropout

    def forward(self, x: torch.Tensor, mask = None):
        batch_size, seq_len, dim = x.shape
        
        # reshaping directly without transposing would result in incorrect tensor dimensions will not reshape on dim // num_heads
        q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        
        # Flash attention implementation
        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
            attn_output = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=mask is None
            )
        
        # Reshape output .contiguous()
        attn_output = attn_output.transpose(1, 2).view(batch_size, seq_len, self.dim)
        return self.out_proj(attn_output)
    
class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 2.0, dropout: float = 0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = FlashAttention(dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(dim)
        self.ff = FeedForward(dim, int(dim * mlp_ratio), dropout)
    
    def forward(self, x: torch.Tensor, mask = None):
        x = x + self.attn(self.norm1(x), mask)
        x = x + self.ff(self.norm2(x))
        return x

class DecoderTransformer(nn.Module):
    def __init__(
        self,
        dim: int = 512,
        depth: int = 6,
        num_heads: int = 8,
        mlp_ratio: float = 2.5,
        dropout: float = 0.1,
        max_seq_length: int = 1024,
    ):
        super().__init__()
        
        self.position_embedding = nn.Parameter(torch.zeros(1, max_seq_length, dim))
        self.dropout = nn.Dropout(dropout)
        
        self.layers = nn.ModuleList([
            TransformerBlock(dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])
        
        self.norm = nn.LayerNorm(dim)
        
    def forward(self, x: torch.Tensor, mask = None):
        seq_len = x.size(1)
        
        x = x + self.position_embedding[:, :seq_len]
        x = self.dropout(x)
        
        # Apply transformer blocks
        for layer in self.layers:
            x = layer(x, mask)
            
        x = self.norm(x)
        return x

In [6]:
depth =6
model = DecoderTransformer(dim=EMBEDDING_DIM, depth=depth, mlp_ratio=2.66, max_seq_length=max_length)

In [7]:
f"parameters : {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"

'parameters : 14,723,046'

In [8]:
total_epochs=6
learning_rate =0.0006999999999999999 # to complete from the0.0006999999999999999 last iter 
weight_decay = 0.002     

In [9]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.AdamW( [{'params':model.parameters()}, {'params':embedding.parameters(), 'lr': learning_rate*0.2}] # 0.2 because 0.3 caused nan values
                               , lr=learning_rate,weight_decay=weight_decay,)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,start_factor=1, end_factor=0.1, total_iters=total_epochs)
# torch.set_float32_matmul_precision('high')
# torch.backends.cuda.matmul.allow_tf32=True
# torch.backends.cudnn.allow_tf32=True
model=model.to(device)

In [10]:
from utils import train, setup_training_state
model = torch.compile(model)
model, embedding, start_epoch = setup_training_state(
        model=model,
        embedding=embedding,
        checkpoint_dir='checkpoints',
        num_layers=depth,
        prefix='TraEmbedAslinear'
    )

Loading checkpoint from checkpoints/TraEmbedAslinearlast_checkpoint6.pt
Successfully loaded checkpoint from epoch 3
Resumed training from epoch 3


  checkpoint = torch.load(checkpoint_path, map_location=device)


In [11]:
train_losses,valid_losses=train(model, embedding, optimizer, scheduler, total_epochs - start_epoch, trainloader, validloader,criterion,depth, use_amp=True, prefix='TraEmbedAslinear')

Epoch 1 Loss: 4.4562: : 56636it [1:32:08, 12.12it/s]

skip: batch{batch_idx} for nan


Epoch 1 Loss: 4.8792: : 86887it [2:21:27, 10.24it/s]
Validating: 5244it [04:41, 18.60it/s]


Epoch:1 | train loss:4.7446 | valid loss:0.0181 | lr: [0.0006999999999999999, 0.00014]
----------------------------------------------------------------------------------------------------


Epoch 2 Loss: 4.7970: : 86887it [2:21:11, 10.26it/s]
Validating: 5244it [04:32, 19.23it/s]


Epoch:2 | train loss:4.6831 | valid loss:0.0179 | lr: [0.0005949999999999999, 0.00011899999999999999]
----------------------------------------------------------------------------------------------------


Epoch 3 Loss: 4.7491: : 86887it [2:21:28, 10.24it/s]
Validating: 5244it [04:32, 19.23it/s]


Epoch:3 | train loss:4.6402 | valid loss:0.0178 | lr: [0.0004899999999999999, 9.799999999999998e-05]
----------------------------------------------------------------------------------------------------


In [12]:
from utils import calculate_perplexity
tran_perplexity = calculate_perplexity(model,embedding, validloader, device)
print(f'Transformer Model Perplexity: {tran_perplexity}')

Transformer Model Perplexity: 93.86618531236887


In [29]:
from utils import sample_from_model
start_sequence = 'هذا الكتاب'
generated_text = sample_from_model(model,embedding,tokenizer,start_sequence,top_k=10,num_generate=20)
print("Generated text:", generated_text)

Generated text: ['هذا الكتاب من عند الله وهو كتاب أنزله الله على نبيه محمد صلى الله عليه و سلم قال أبو بكر حدثنا محمد']


## 4. Mamba SSM Model
Implementation using Mamba State Space Model:
- 6 Mamba layers
- State dimension: 64
- Convolution width: 4
- Expansion factor: 3
- Scaled embedding with sqrt(dim) * 2

Training Stability Notes:
- Learning rate reduced to 1e-4 for stability
- Embedding scaling factor (sqrt(dim) * 2) crucial for preventing NaN losses
- Embedding learning rate set to main_lr * 0.2
- These adjustments were necessary to prevent training instability and NaN values


In [2]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(f'checkpoints/word2vec_{EMBEDDING_DIM}_c{CONTEXT_SIZE}_n{CONTEXT_SIZE}_cobw.bin')
not_train= []
embedding=nn.Embedding(tokenizer.get_vocab_size(),EMBEDDING_DIM)
for word, idx in tokenizer.get_vocab().items():
    if model.has_index_for(word):
        with torch.no_grad():
            embedding.weight[idx]=torch.tensor(model.get_vector(word))
            not_train.append(idx)


In [3]:
# def criterion(output,true_t):
#     return 1- torch.mean(F.cosine_similarity(output,embedding(true_t)))  
criterion = nn.CrossEntropyLoss()


In [4]:
from utils import create_streaming_dataloaders

max_length = 64
batch_size = 256
batch_buffer_size = 100
stride = 32
train_size=int(len(files)*0.9)
trainloader = create_streaming_dataloaders(
    files=files[:train_size],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=stride,  
    batch_buffer_size=batch_buffer_size,
    num_workers=0,
    shuffle_buffer=True,
)

validloader = create_streaming_dataloaders(
    files=files[train_size:],
    tokenizer=tokenizer,
    batch_size=batch_size,  
    sequence_length=max_length,  
    stride=max_length,  
    batch_buffer_size=batch_buffer_size,
    num_workers=0,
    shuffle_buffer= False,

)

In [5]:
from mamba_ssm import Mamba2


class MammbaModel(nn.Module):
    def __init__(
        self,
        dim: int = 512,
        depth: int = 6,
        dropout: float = 0.1,
    ):
        super().__init__()
        
        self.dropout = nn.Dropout(dropout)
        self.embed_scale = np.sqrt(dim) * 2

        self.layers = nn.ModuleList([
            Mamba2(
            # This module uses roughly 3 * expand * d_model^2 parameters
            d_model=dim, # Model dimension d_model
            d_state=64,  # SSM state expansion factor, typically 64 or 128
            d_conv=4,    # Local convolution width
            expand=3,    # Block expansion factor
            # dt_limit=0,
        )
        for _ in range(depth)
        ])
        
        self.norm = nn.LayerNorm(dim)
        
    def forward(self, x: torch.Tensor):
        x = self.dropout(x / self.embed_scale)
        
        # Apply transformer blocks
        for layer in self.layers:
            x = layer(x)
            
        x = self.norm(x)
        return x

In [6]:
depth =6
model = MammbaModel(dim=EMBEDDING_DIM, depth=depth)

In [7]:
f"parameters : {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"

'parameters : 14,683,312'

In [8]:
total_epochs=6
learning_rate =1e-4  # to complete from the last iter 
weight_decay = 0.002     

In [9]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.AdamW( [{'params':model.parameters()}, {'params':embedding.parameters(), 'lr': learning_rate*0.2}] 
                               , lr=learning_rate,weight_decay=weight_decay,)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,start_factor=1, end_factor=0.1, total_iters=total_epochs)
# torch.set_float32_matmul_precision('high')
# torch.backends.cuda.matmul.allow_tf32=True
# torch.backends.cudnn.allow_tf32=True
model=model.to(device)

In [10]:
from utils import train, setup_training_state
# model = torch.compile(model)
model, embedding, start_epoch = setup_training_state(
        model=model,
        embedding=embedding,
        checkpoint_dir='checkpoints',
        num_layers=depth,
        prefix='MambaEmbedAslinear'
    )

No checkpoint found, starting fresh training


In [11]:
train_losses,valid_losses=train(model, embedding, optimizer, scheduler, total_epochs - start_epoch, trainloader, validloader,criterion,depth, use_amp=True, prefix='MambaEmbedAslinear')

Epoch 1 Loss: 5.1022: : 86833it [3:24:19,  7.08it/s]
Validating: 5237it [05:15, 16.58it/s]


Epoch:1 | train loss:5.4972 | valid loss:0.0191 | lr: [0.0001, 2e-05]
----------------------------------------------------------------------------------------------------


Epoch 2 Loss: 4.9680: : 86833it [3:27:04,  6.99it/s]
Validating: 5237it [05:15, 16.59it/s]


Epoch:2 | train loss:4.8150 | valid loss:0.0185 | lr: [8.5e-05, 1.7e-05]
----------------------------------------------------------------------------------------------------


Epoch 3 Loss: 4.8196: : 86833it [3:27:37,  6.97it/s]
Validating: 5237it [05:15, 16.58it/s]


Epoch:3 | train loss:4.7007 | valid loss:0.0182 | lr: [7.000000000000001e-05, 1.4e-05]
----------------------------------------------------------------------------------------------------


Epoch 4 Loss: 4.8320: : 86833it [3:28:25,  6.94it/s]
Validating: 5237it [05:16, 16.54it/s]


Epoch:4 | train loss:4.6360 | valid loss:0.0180 | lr: [5.5e-05, 1.1e-05]
----------------------------------------------------------------------------------------------------


Epoch 5 Loss: 4.7473: : 86833it [3:28:18,  6.95it/s]
Validating: 5237it [05:16, 16.54it/s]


Epoch:5 | train loss:4.5923 | valid loss:0.0179 | lr: [4e-05, 8e-06]
----------------------------------------------------------------------------------------------------


Epoch 6 Loss: 4.7137: : 86833it [3:28:16,  6.95it/s]
Validating: 5237it [05:17, 16.49it/s]


Epoch:6 | train loss:4.5600 | valid loss:0.0178 | lr: [2.5e-05, 4.9999999999999996e-06]
----------------------------------------------------------------------------------------------------


In [12]:
from utils import calculate_perplexity
tran_perplexity = calculate_perplexity(model,embedding, validloader, device)
print(f'Transformer Model Perplexity: {tran_perplexity}')

Transformer Model Perplexity: 94.16253863516155


In [27]:
from utils import sample_from_model
start_sequence = 'هذا الكتاب'
generated_text = sample_from_model(model,embedding,tokenizer,start_sequence,top_k=10,num_generate=20)
print("Generated text:", generated_text)

Generated text: ['هذا الكتاب ، وهذا ما حدث في عهد الملك الناصر فرج في عهد السلطان صلاح الدين الأيوبي ، حيث كان قد']


## Implementation Challenges:
1. Threading and Tokenization:
   - Issues encountered with parallel file processing and batch encoding
   - System became stable by setting num_workers=0 and enabling TOKENIZERS_PARALLELISM instead of the file parallelism method
   - Possible memory/thread contention issues with encode_batch method

2. Training Observations:
   - Models trained for 6 epochs with stride=32 (effective 12 epochs)
   - Transformer showed more natural text generation
   - RNN and Mamba occasionally produced repetitive punctuation