In [None]:
! pip install einops datasets

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Stream the tokenizer training text in fixed-size batches
def get_training_corpus():
    """Yield the ``"Text"`` column of the ``v1`` split in slices of up to 1000 rows."""
    batch_size = 1000
    corpus = load_dataset("ayoubkirouane/Algerian-Darija", split="v1")
    start = 0
    while start < len(corpus):
        # Slicing the dataset returns a dict of columns; only the text column is needed.
        yield corpus[start : start + batch_size]["Text"]
        start += batch_size

# Initialize the base tokenizer
base_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

# Train the new tokenizer
new_tokenizer = base_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=3000)

# Reuse the end-of-sequence token for padding
new_tokenizer.pad_token = new_tokenizer.eos_token

fim_prefix_token = "<fim_prefix>"
fim_middle_token = "<fim_middle_token>"
fim_suffix_token = "<fim_suffix_token>"
fim_pad_token = "<fim_pad>"

# Register each FIM token exactly once. The suffix token must be in this list;
# previously the middle token was listed twice and the suffix token was never added.
new_tokenizer.add_tokens(
    [
        fim_prefix_token,
        fim_middle_token,
        fim_suffix_token,
        fim_pad_token,
    ]
)

# Look up the id of every FIM token from its own string so that the
# prefix/middle/suffix ids are distinct.
prefix_tok_id = new_tokenizer.convert_tokens_to_ids(fim_prefix_token)
middle_tok_id = new_tokenizer.convert_tokens_to_ids(fim_middle_token)
suffix_tok_id = new_tokenizer.convert_tokens_to_ids(fim_suffix_token)
pad_tok_id = None

fim_tokens = [prefix_tok_id, middle_tok_id, suffix_tok_id]


# If truncate_or_pad is on, also get pad token id
truncate_or_pad = True
if truncate_or_pad:
    pad_tok_id = new_tokenizer.convert_tokens_to_ids(fim_pad_token)
    fim_tokens.append(pad_tok_id)

# Save the new tokenizer
new_tokenizer.save_pretrained("darija_tokenizer")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2324 [00:00<?, ? examples/s]

Generating v1 split:   0%|          | 0/168655 [00:00<?, ? examples/s]

('darija_tokenizer/tokenizer_config.json',
 'darija_tokenizer/special_tokens_map.json',
 'darija_tokenizer/tokenizer.json')

In [None]:
import torch
import torch.nn.functional as F
import math
from einops import einsum, rearrange, repeat
from torch import Tensor, nn
from typing import Optional


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention built on ``F.scaled_dot_product_attention``.

    Args:
        dim: Width of the input and output features.
        heads: Number of attention heads.
        dim_head: Width of each head; the projections map ``dim -> heads * dim_head``.
        dropout: Dropout probability applied after the output projection.
        use_linear_attn: Stored on the instance; not used by this class.
    """

    def __init__(
        self,
        dim: int,
        heads: int,
        dim_head: int,
        dropout: float = 0.1,
        use_linear_attn: bool = False,
    ):
        super().__init__()
        self.dim = dim
        self.heads = heads
        self.dim_head = dim_head
        self.dropout = dropout
        self.use_linear_attn = use_linear_attn

        inner_dim = heads * dim_head
        # Kept for reference; the softmax scaling itself is applied inside
        # scaled_dot_product_attention (see forward).
        self.scale = dim_head ** -0.5

        # Query, Key, and Value projection layers
        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_k = nn.Linear(dim, inner_dim, bias=False)
        self.to_v = nn.Linear(dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """Apply causal self-attention.

        Args:
            x: Input of shape ``(batch, seq, dim)``.
            mask: Accepted for interface compatibility; it is not used, the
                attention pattern is always purely causal.

        Returns:
            Tensor of shape ``(batch, seq, dim)``.
        """
        b, n, _ = x.shape

        def split_heads(t: Tensor) -> Tensor:
            # (b, n, heads * dim_head) -> (b, heads, n, dim_head)
            return t.reshape(b, n, self.heads, self.dim_head).transpose(1, 2)

        q = split_heads(self.to_q(x))
        k = split_heads(self.to_k(x))
        v = split_heads(self.to_v(x))

        # scaled_dot_product_attention already divides the logits by
        # sqrt(dim_head) by default, so q must NOT be pre-multiplied by
        # self.scale (doing both scaled the logits by 1 / dim_head).
        attn_output = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # (b, heads, n, dim_head) -> (b, n, heads * dim_head)
        out = attn_output.transpose(1, 2).reshape(b, n, -1)

        return self.to_out(out)



class LinearAttention(nn.Module):
    """Multi-head linear ("efficient") attention.

    Queries are soft-maxed over the feature axis and keys over the sequence
    axis; a ``(dim_head x dim_head)`` context ``K^T V`` is then read out by the
    queries, so the cost is linear in sequence length.

    Note: the context sums over *all* sequence positions, so this layer is not
    causal.

    Args:
        dim: Width of the input and output features.
        heads: Number of attention heads.
        dim_head: Width of each head.
        dropout: Dropout probability applied after the output projection.
    """

    def __init__(self, dim, *, heads=4, dim_head=64, dropout=0.0):
        super().__init__()
        inner_dim = heads * dim_head
        self.heads = heads
        self.scale = dim_head**-0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask=None):
        """Apply linear attention.

        Args:
            x: Input of shape ``(batch, seq, dim)``.
            mask: Optional boolean tensor, ``True`` marking key positions to
                exclude. It is passed to ``masked_fill`` on keys of shape
                ``(batch, heads, seq, dim_head)``, so it must broadcast to that
                shape (e.g. ``(batch, 1, seq, 1)``).

        Returns:
            Tensor of shape ``(batch, seq, dim)``.
        """
        h = self.heads
        b, n = x.shape[0], x.shape[1]

        # Get queries, keys, and values
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)

        # Reshape for multi-head attention: (b, n, h * d) -> (b, h, n, d)
        q = q.reshape(b, n, h, -1).transpose(1, 2)
        k = k.reshape(b, n, h, -1).transpose(1, 2)
        v = v.reshape(b, n, h, -1).transpose(1, 2)

        # Out-of-place on purpose: q is a view of a chunk() output, and
        # autograd refuses in-place edits of such multi-output views.
        q = q * self.scale

        # Mask BEFORE the softmax so excluded keys receive zero weight;
        # filling with -inf after the softmax would inject inf/NaN.
        if mask is not None:
            k = k.masked_fill(mask, float('-inf'))

        q = F.softmax(q, dim=-1)
        k = F.softmax(k, dim=-2)

        # context[d, e] = sum_n k[n, d] * v[n, e]; out[n, e] = sum_d q[n, d] * context[d, e]
        context = torch.einsum('b h n d, b h n e -> b h d e', k, v)
        out = torch.einsum('b h d e, b h n d -> b h n e', context, q)

        # Reshape back to original dimensions and apply the final linear layer
        out = out.transpose(1, 2).reshape(b, n, -1)
        return self.to_out(out)

class MambaBlock(nn.Module):
    def __init__(
        self,
        dim: int = None,
        depth: int = 5,
        d_state: int = 16,
        expand: int = 2,
        d_conv: int = 4,
        conv_bias: bool = True,
        bias: bool = False,
    ):
        """A single Mamba block, as described in Figure 3 in Section 3.4 in the Mamba paper [1].

        Args:
            dim: Width of the block's input and output features. The default of
                ``None`` is not usable: ``dim`` is used in arithmetic below.
            depth: Stored on the instance; not otherwise used in this class.
            d_state: State size ``n`` per inner channel (number of columns of
                ``A_log`` and width of the input-dependent ``B`` and ``C``).
            expand: Inner width multiplier; ``dim_inner = dim * expand``.
            d_conv: Kernel size of the depthwise 1-D convolution.
            conv_bias: Whether the convolution has a bias.
            bias: Whether ``in_proj`` and ``out_proj`` have a bias.
        """
        super().__init__()
        self.dim = dim
        self.depth = depth
        self.d_state = d_state
        self.expand = expand
        self.d_conv = d_conv
        self.conv_bias = conv_bias
        self.bias = bias

        # Rank of the low-rank Δ projection: ceil(dim / 16). The divisor is the
        # constant 16, independent of d_state.
        dt_rank = math.ceil(self.dim / 16)
        self.dt_rank = dt_rank

        # Inner (expanded) channel width
        dim_inner = dim * expand
        self.dim_inner = dim_inner

        # Projects the input to 2 * dim_inner features; forward() splits them
        # into the SSM branch and the gating (residual) branch.
        self.in_proj = nn.Linear(dim, dim_inner * 2, bias=bias)

        # Depthwise convolution (groups == channels). With padding = d_conv - 1
        # and forward() keeping only the first `l` outputs, each output position
        # depends only on the current and previous inputs.
        self.conv1d = nn.Conv1d(
            in_channels=dim_inner,
            out_channels=dim_inner,
            bias=conv_bias,
            kernel_size=d_conv,
            groups=dim_inner,
            padding=d_conv - 1,
        )

        # x_proj takes in `x` and outputs the input-specific Δ, B, C
        self.x_proj = nn.Linear(
            dim_inner, dt_rank + self.d_state * 2, bias=False
        )

        # dt_proj projects Δ from dt_rank to dim_inner
        self.dt_proj = nn.Linear(dt_rank, dim_inner, bias=True)

        # A is stored as log(A) with rows [1, 2, ..., d_state] repeated for every
        # inner channel; ssm() recovers the (negative) A as -exp(A_log).
        A = repeat(torch.arange(1, self.d_state + 1), "n -> d n", d=dim_inner)
        self.A_log = nn.Parameter(torch.log(A))
        # Per-channel skip weight, used as `u * D` in selective_scan()
        self.D = nn.Parameter(torch.ones(dim_inner))
        self.out_proj = nn.Linear(dim_inner, dim, bias=bias)

    def forward(self, x: Tensor):
        """Run the block on ``x`` of shape ``(b, l, d)`` and return ``(b, l, dim)``."""
        (b, l, d) = x.shape

        x_and_res = self.in_proj(x)  # shape (b, l, 2 * dim_inner)
        x_and_res = rearrange(x_and_res, "b l x -> b x l")
        # Channel-first split into the SSM input and the gating branch
        (x, res) = x_and_res.split(
            split_size=[self.dim_inner, self.dim_inner], dim=1
        )

        # Drop the trailing outputs produced by the right-hand padding
        x = self.conv1d(x)[:, :, :l]
        x = F.silu(x)

        y = self.ssm(x)

        # Gate the SSM output with the activated second branch
        y = y * F.silu(res)

        output = self.out_proj(rearrange(y, "b dim l -> b l dim"))

        return output

    def ssm(self, x: Tensor):
        """Compute the input-dependent Δ, B, C from ``x`` and run the selective scan.

        ``x`` is channel-first, ``(b, d_in, l)``; the result has the same shape.
        """

        (d_in, n) = self.A_log.shape

        A = -torch.exp(self.A_log.float())  # shape (d_in, n)
        D = self.D.float()

        x_dbl = rearrange(x, "b d l -> b l d")
        x_dbl = self.x_proj(x_dbl)  # (b, l, dt_rank + 2*n)

        (delta, B, C) = x_dbl.split(
            split_size=[self.dt_rank, n, n], dim=-1
        )  # delta: (b, l, dt_rank). B, C: (b, l, n)
        delta = F.softplus(self.dt_proj(delta))  # (b, l, d_in)

        y = self.selective_scan(
            x, delta, A, B, C, D
        )  # This is similar to run_SSM(A, B, C, u) in The Annotated S4 [2]

        return y

    def selective_scan(self, u, delta, A, B, C, D):
        """Sequential state-space scan over the sequence axis.

        Args:
            u: Input, ``(b, d_in, l)``.
            delta: Step sizes, ``(b, l, d_in)``.
            A: ``(d_in, n)``.
            B: ``(b, l, n)``.
            C: ``(b, l, n)``.
            D: ``(d_in,)`` skip weight, or ``None`` to omit the skip term.

        Returns:
            Tensor of shape ``(b, d_in, l)``.
        """

        (b, d_in, l) = u.shape
        n = A.shape[1]


        # Per-step state transition exp(Δ * A) and input term Δ * B * u,
        # both of shape (b, d_in, l, n)
        deltaA = torch.exp(einsum(delta, A, "b l d_in, d_in n -> b d_in l n"))
        deltaB_u = einsum(
            delta, B, u, "b l d_in, b l n, b d_in l -> b d_in l n"
        )

        # Perform selective scan (see scan_SSM() in The Annotated S4 [2])
        # The hidden state starts at zero, in the default dtype, on the device
        # of this module's first parameter.
        x = torch.zeros((b, d_in, n), device=next(self.parameters()).device)
        ys = []
        for i in range(l):
            # State update for step i, then read-out through C at step i
            x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
            y = einsum(x, C[:, i, :], "b d_in n , b n -> b d_in")
            ys.append(y)
        y = torch.stack(ys, dim=2)  # (b d_in l)

        if D is not None:
            # Skip connection: add the input scaled per channel by D
            y = y + u * rearrange(D, "d_in -> d_in 1")

        return y

class FeedForward(nn.Module):
    """Two-layer position-wise MLP: ``Linear -> SiLU -> [LayerNorm] -> Dropout -> Linear``.

    Args:
        dim: Input feature width.
        dim_out: Output feature width; falls back to ``dim`` when falsy.
        mult: Hidden width multiplier (``hidden = int(dim * mult)``).
        post_act_ln: Insert a ``LayerNorm`` over the hidden features after the activation.
        dropout: Dropout probability applied to the hidden features.
        no_bias: Build both linear layers without a bias term.
        triton_kernels_on: Stored on the instance; not used by this class.
    """

    def __init__(
        self,
        dim: Optional[int] = None,
        dim_out: Optional[int] = None,
        mult: Optional[int] = 4,
        post_act_ln: Optional[bool] = False,
        dropout: Optional[float] = 0.0,
        no_bias: Optional[bool] = False,
        triton_kernels_on: bool = False,
    ):
        super().__init__()

        # Keep the raw constructor arguments available on the instance.
        self.dim = dim
        self.dim_out = dim_out
        self.mult = mult
        self.post_act_ln = post_act_ln
        self.dropout = dropout
        self.no_bias = no_bias
        self.triton_kernels_on = triton_kernels_on

        hidden_features = int(dim * mult)
        out_features = dim_out or dim
        use_bias = not no_bias

        # Input projection followed by the activation, kept as a nested
        # Sequential so parameter names stay ``ff.0.0.*``.
        project_in = nn.Sequential(
            nn.Linear(dim, hidden_features, bias=use_bias),
            nn.SiLU(),
        )

        stages = [project_in]
        if post_act_ln:
            stages.append(nn.LayerNorm(hidden_features))
        stages.append(nn.Dropout(dropout))
        stages.append(nn.Linear(hidden_features, out_features, bias=use_bias))

        self.ff = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.ff(x)


class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain.

    ``F.normalize`` divides by the L2 norm, and ``rms(x) = ||x|| / sqrt(dim)``,
    so ``x / rms(x) == F.normalize(x) * sqrt(dim)``. The scale is therefore
    ``dim ** 0.5``; using ``dim ** -0.5`` shrinks the output by a factor of
    ``dim`` relative to a true RMS norm.

    Args:
        dim: Size of the last dimension of the inputs.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.scale = dim ** 0.5
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x / rms(x) * g``, computed along the last dimension."""
        return F.normalize(x, dim=-1) * self.scale * self.g


class TransformerBlock(nn.Module):
    """Attention -> LayerNorm -> FeedForward, without internal residuals.

    Args:
        dim: Feature width of the input and output.
        heads: Number of attention heads.
        dim_head: Width of each attention head.
        dropout: Dropout probability passed to both attention modules.
        ff_mult: Hidden width multiplier of the feed-forward network.
        use_linear_attn: Use ``LinearAttention`` instead of ``MultiHeadAttention``
            in ``forward``.
        *args, **kwargs: Extra arguments forwarded to ``FeedForward`` after
            ``(dim, dim, ff_mult)``.
    """

    def __init__(
        self,
        dim: int,
        heads: int,
        dim_head: int,
        dropout: float = 0.1,
        ff_mult: int = 4,
        use_linear_attn: bool = False,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.dim = dim
        self.heads = heads
        self.dim_head = dim_head
        self.dropout = dropout
        self.ff_mult = ff_mult
        self.use_linear_attn = use_linear_attn

        # MultiHeadAttention requires dim_head; pass the attention settings
        # explicitly rather than relying on *args (which left dim_head missing
        # and raised a TypeError whenever no extra arguments were supplied).
        self.attn = MultiHeadAttention(dim, heads, dim_head, dropout)

        # Linear Attention
        self.linear_attn = LinearAttention(
            dim=dim, heads=heads, dim_head=dim_head, dropout=dropout
        )

        self.ffn = FeedForward(dim, dim, ff_mult, *args, **kwargs)

        # Normalization
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: Tensor) -> Tensor:
        """
        Performs a forward pass of the TransformerBlock.

        Args:
            x (Tensor): The input tensor of shape (batch, seq, dim).

        Returns:
            Tensor: The output tensor, same shape as the input.

        """
        # Both attention modules return a single tensor, so the result must not
        # be tuple-unpacked.
        attention = self.linear_attn if self.use_linear_attn else self.attn
        x = attention(x)
        x = self.norm(x)
        return self.ffn(x)


class MambaTransformerblock(nn.Module):
    """Stack of interleaved Mamba, Transformer and feed-forward sub-blocks.

    Three independent lists are built: ``mamba_depth`` Mamba blocks,
    ``transformer_depth`` Transformer blocks and ``depth`` feed-forward blocks.
    Layer ``i`` of the stack applies, in that order, whichever of the three
    lists has an ``i``-th entry, each preceded by the shared LayerNorm and
    wrapped in a residual connection.

    Args:
        dim: Feature width.
        heads: Number of attention heads of the Transformer blocks.
        depth: Number of feed-forward blocks.
        dim_head: Width of each attention head.
        dropout: Dropout probability of the Transformer blocks.
        ff_mult: Hidden width multiplier of the feed-forward networks.
        d_state: State size passed to every ``MambaBlock``.
        transformer_depth: Number of Transformer blocks.
        mamba_depth: Number of Mamba blocks.
        use_linear_attn: Whether the Transformer blocks use linear attention.
        *args, **kwargs: Forwarded unchanged to the ``MambaBlock``,
            ``TransformerBlock`` and ``FeedForward`` constructors. The three
            constructors interpret extra positional arguments differently, so
            prefer leaving these empty.
    """

    def __init__(
        self,
        dim: int,
        heads: int,
        depth: int,
        dim_head: int,
        dropout: float = 0.1,
        ff_mult: int = 4,
        d_state: int = None,
        transformer_depth: int = 1,
        mamba_depth: int = 1,
        use_linear_attn: bool = False,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.dim = dim
        self.heads = heads
        self.depth = depth
        self.dim_head = dim_head
        self.dropout = dropout
        self.ff_mult = ff_mult
        self.d_state = d_state
        self.transformer_depth = transformer_depth
        self.mamba_depth = mamba_depth
        self.use_linear_attn = use_linear_attn

        # Mamba, Transformer, and ffn blocks
        self.mamba_blocks = nn.ModuleList([
            MambaBlock(dim, mamba_depth, d_state, *args, **kwargs)
            for _ in range(mamba_depth)
        ])
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                dim,
                heads,
                dim_head,
                dropout,
                ff_mult,
                use_linear_attn,
                *args,
                **kwargs,
            ) for _ in range(transformer_depth)
        ])

        self.ffn_blocks = nn.ModuleList([
            FeedForward(dim, dim, ff_mult, *args, **kwargs)
            for _ in range(depth)
        ])

        # Layernorm (one instance shared by every sub-block)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: Tensor) -> Tensor:
        """Run every constructed sub-block on ``x`` of shape ``(batch, seq, dim)``."""
        # zip() over the three lists stops at the shortest one, which silently
        # drops blocks when the depths differ and skips the whole stack when
        # any list is empty. Iterate to the longest list instead and apply the
        # sub-blocks that exist at each layer index. When all three depths are
        # equal this is identical to the zipped loop.
        n_layers = max(
            len(self.mamba_blocks),
            len(self.transformer_blocks),
            len(self.ffn_blocks),
        )
        for i in range(n_layers):
            if i < len(self.mamba_blocks):
                x = self.norm(x)
                x = self.mamba_blocks[i](x) + x
            if i < len(self.transformer_blocks):
                x = self.norm(x)
                x = self.transformer_blocks[i](x) + x
            if i < len(self.ffn_blocks):
                x = self.norm(x)
                x = self.ffn_blocks[i](x) + x

        return x


class MambaTransformer(nn.Module):
    """Token-level language model: embedding -> MambaTransformerblock -> logits.

    Args:
        num_tokens: Vocabulary size (rows of the embedding table and width of
            the output logits).
        dim: Model feature width.
        heads: Number of attention heads.
        depth: Number of feed-forward blocks in the stack.
        dim_head: Width of each attention head.
        dropout: Dropout probability of the Transformer blocks.
        ff_mult: Hidden width multiplier of the feed-forward networks.
        d_state: State size of the Mamba blocks.
        return_embeddings: If true, ``forward`` returns the hidden states
            instead of logits.
        transformer_depth: Number of Transformer blocks.
        mamba_depth: Number of Mamba blocks.
        use_linear_attn: Whether the Transformer blocks use linear attention.
        *args, **kwargs: Forwarded to ``MambaTransformerblock``.
    """

    def __init__(
        self,
        num_tokens: int,
        dim: int,
        heads: int,
        depth: int,
        dim_head: int,
        dropout: float = 0.1,
        ff_mult: int = 4,
        d_state: int = None,
        return_embeddings: bool = False,
        transformer_depth: int = 1,
        mamba_depth: int = 1,
        use_linear_attn=False,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.dim = dim
        self.heads = heads
        self.depth = depth
        self.dim_head = dim_head
        self.d_state = d_state
        self.dropout = dropout
        self.ff_mult = ff_mult
        self.return_embeddings = return_embeddings
        self.transformer_depth = transformer_depth
        self.mamba_depth = mamba_depth
        self.use_linear_attn = use_linear_attn

        self.emb = nn.Embedding(num_tokens, dim)

        # The arguments must follow MambaTransformerblock's positional order
        # exactly. `return_embeddings` is NOT one of its parameters: passing it
        # here shifted every later argument by one slot (return_embeddings ->
        # transformer_depth, transformer_depth -> mamba_depth, mamba_depth ->
        # use_linear_attn, use_linear_attn -> *args).
        self.mt_block = MambaTransformerblock(
            dim,
            heads,
            depth,
            dim_head,
            dropout,
            ff_mult,
            d_state,
            transformer_depth,
            mamba_depth,
            use_linear_attn,
            *args,
            **kwargs,
        )
        self.to_logits = nn.Sequential(
            RMSNorm(dim), nn.Linear(dim, num_tokens)
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass of the MambaTransformer model.

        Args:
            x (Tensor): Input tensor of token ids, shape (batch_size, sequence_length).

        Returns:
            Tensor: Logits of shape (batch_size, sequence_length, num_tokens), or
            the hidden states of shape (batch_size, sequence_length, dim) when
            ``return_embeddings`` is set.
        """
        x = self.emb(x)
        x = self.mt_block(x)

        if self.return_embeddings:
            return x

        return self.to_logits(x)

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

class TextDataset(Dataset):
    """Next-token-prediction dataset over the ``'Text'`` column of a Hugging Face dataset.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        split: Split passed to ``load_dataset``.
        tokenizer: Callable tokenizer with a ``pad_token_id`` attribute.
        max_length: Every example is truncated/padded to this many tokens.
    """

    def __init__(self, dataset_name, split, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.dataset = load_dataset(dataset_name, split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)``, both 1-D tensors of length ``max_length``.

        ``labels[i]`` is the token that follows ``input_ids[i]``.
        """
        text = self.dataset[idx]['Text']  # Adjust 'Text' if your dataset uses a different column name
        encoding = self.tokenizer(text, truncation=True,
                                  padding='max_length',
                                  max_length=self.max_length,
                                  return_tensors='pt') # Ensure PyTorch Tensor output

        # Drop only the leading batch axis; a bare squeeze() would also collapse
        # the sequence axis when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)

        # Shift labels one step to the left so position i predicts token i + 1.
        labels = input_ids.clone()
        labels[:-1] = input_ids[1:]
        # The final position has no next token. Mark it with the pad id (the
        # value the evaluation loss in this notebook ignores) instead of
        # leaving a copy of the last input token as its own target.
        labels[-1] = self.tokenizer.pad_token_id
        return input_ids, labels  # Return both input_ids and labels

In [None]:
# Create an instance of the MambaTransformer model
model = MambaTransformer(
    num_tokens=new_tokenizer.vocab_size,  # Vocabulary size: embedding rows and output logit width
    dim=512,  # Dimension of the model
    heads=8,  # Number of attention heads
    depth=4,  # Requested number of feed-forward blocks in the stack
    dim_head=64,  # Dimension of each attention head
    d_state=512,  # State size requested for the Mamba blocks
    dropout=0.1,  # Dropout rate
    ff_mult=4,  # Multiplier for the feed-forward layer dimension
    return_embeddings=False,  # False: forward() returns logits rather than hidden states
    transformer_depth=2,  # Requested number of transformer blocks
    mamba_depth=10,  # Requested number of Mamba blocks
    use_linear_attn=True,  # Whether to use linear attention
)


In [None]:
model

MambaTransformer(
  (emb): Embedding(3000, 512)
  (mt_block): MambaTransformerblock(
    (mamba_blocks): ModuleList(
      (0-1): 2 x MambaBlock(
        (in_proj): Linear(in_features=512, out_features=1024, bias=False)
        (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
        (x_proj): Linear(in_features=512, out_features=1056, bias=False)
        (dt_proj): Linear(in_features=32, out_features=512, bias=True)
        (out_proj): Linear(in_features=512, out_features=512, bias=False)
      )
    )
    (transformer_blocks): ModuleList()
    (ffn_blocks): ModuleList(
      (0-3): 4 x FeedForward(
        (ff): Sequential(
          (0): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): SiLU()
          )
          (1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=2048, out_features=512, bias=True)
        )
   

In [None]:
import time
from tqdm import tqdm
def _language_model_loss(logits: Tensor, labels: Tensor, ignore_index: int) -> Tensor:
    """Mean cross-entropy over the label positions that are not ``ignore_index``.

    The vocabulary width is taken from the logits themselves. A batch in which
    every label is ignored yields a loss of 0 instead of NaN.
    """
    flat_labels = labels.reshape(-1)
    loss_sum = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        flat_labels,
        ignore_index=ignore_index,
        reduction="sum",
    )
    n_targets = (flat_labels != ignore_index).sum().clamp(min=1)
    return loss_sum / n_targets


def train(model: MambaTransformer,
          train_data: DataLoader,
          optimizer: torch.optim.Optimizer,

          val_data: DataLoader = None,
          epochs: int = 10,
          tokenizer: AutoTokenizer = new_tokenizer,
          device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
          clip_grad_norm: float = 1.0,
          lr_scheduler=None):
    """Trains the MambaTransformer language model and saves it to ``Hybrid.pt``.

    Args:
        model: The model to train.
        train_data: A DataLoader yielding ``(input_ids, labels)`` batches.
        optimizer: The optimizer to use for training.
        val_data: Optional DataLoader evaluated after every epoch.
        epochs: The number of training epochs.
        tokenizer: Tokenizer whose ``pad_token_id`` marks label positions that
            are excluded from both the training and the evaluation loss.
        device: The device to use for training (e.g., 'cuda' or 'cpu').
        clip_grad_norm: The maximum norm of the gradients to clip.
        lr_scheduler: An optional learning rate scheduler; it is stepped once
            per batch.
    """

    model = model.to(device)

    # Use the same ignore rule for training and evaluation so the two losses
    # are comparable and padding does not dominate the training signal.
    pad_id = tokenizer.pad_token_id
    ignore_index = -100 if pad_id is None else pad_id

    print("Training...")
    for epoch in range(epochs):
        # Re-enter training mode every epoch: the evaluation phase below
        # switches the model to eval mode.
        model.train()

        print(f"Epoch {epoch+1}/{epochs}")
        total_loss = 0.0
        start_time = time.time()

        for batch in tqdm(train_data, leave=False):
            input_ids, labels = batch

            input_ids, labels = input_ids.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids)

            # Calculate loss (cross-entropy for language modeling)
            loss = _language_model_loss(outputs, labels, ignore_index)

            # Backward pass
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)

            # Update weights
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader
        avg_loss = total_loss / max(len(train_data), 1)
        elapsed_time = time.time() - start_time
        print(f"Average loss: {avg_loss:.4f} | Elapsed time: {elapsed_time:.2f}s")

        if val_data is not None:
            # Evaluation Phase
            model.eval()
            eval_loss = 0.0
            with torch.no_grad():
                for input_ids, labels in val_data:
                    input_ids = input_ids.to(device)
                    labels = labels.to(device)

                    # Forward pass
                    outputs = model(input_ids)

                    # Calculate loss
                    loss = _language_model_loss(outputs, labels, ignore_index)
                    eval_loss += loss.item()
            avg_eval_loss = eval_loss / max(len(val_data), 1)
            print(f"Epoch: {epoch+1}, Evaluation Loss: {avg_eval_loss:.4f}")

    model_save_path = "Hybrid.pt"
    torch.save(model.state_dict(), model_save_path)
    print("Training complete!")

In [None]:
# AdamW over every model parameter
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# NOTE(review): T_max is counted in lr_scheduler.step() calls, and train() in this
# notebook calls step() once per batch, so the cosine half-period here is 10 batches
# rather than 10 epochs — confirm this is intended.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

In [None]:
# Tokenized training set: every example is padded/truncated to 128 tokens
train_dataset = TextDataset("ayoubkirouane/Algerian-Darija",
                      "v1",
                      new_tokenizer,
                      max_length=128)
# Shuffle the training examples so consecutive batches are not drawn in the
# dataset's storage order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# Run a single training epoch with gradient clipping and the cosine schedule
train(
    model=model,
    train_data=train_loader,
    optimizer=optimizer,
    epochs=1,
    tokenizer=new_tokenizer,
    clip_grad_norm=1.0,
    lr_scheduler=lr_scheduler,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

max_length = 30
num_return_sequences = 10
top_k = 50

# Sampling must run on the same device as the prompt and with dropout disabled.
model.to(device)
model.eval()

# Encode the prompt once and replicate it for every sequence to generate
tokens = new_tokenizer.encode("نتا  ")
tokens = torch.tensor(tokens , dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)

x = tokens.to(device)

with torch.no_grad():
    while x.size(1) < max_length:
        outputs = model(x)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # Only the distribution at the last position is needed
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)

        # Top-k sampling; k is capped by the vocabulary size so topk cannot fail
        k = min(top_k, probs.size(-1))
        topk_probs, topk_indices = torch.topk(probs, k, dim=-1)

        # multinomial accepts unnormalised weights; map the sampled slot back
        # to its vocabulary id
        ix = torch.multinomial(topk_probs, 1)
        xcol = torch.gather(topk_indices, -1, ix)
        x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = new_tokenizer.decode(tokens, skip_special_tokens=True)
    print(">", decoded)