In [1]:
# Standard library imports
import json
import os
from pathlib import Path
from typing import List, Optional

# Third-party library imports
import fire
import safetensors.torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
from datasets import load_dataset
from torch.optim import Adam
from torch.utils.data import DataLoader
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Local application/library-specific imports
from mistral import ModelArgs, Transformer, RMSNorm, precompute_freqs_cis

device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the dataset in streaming mode
ds = load_dataset("HuggingFaceTB/cosmopedia", "stories", streaming=True,)

# Initialize a counter
counter = 0

# Iterate over the dataset
dataset = {
    "text": [],
}

for sample in ds["train"]:
    dataset["text"].append(sample["text"])
    counter += 1
    if counter >= 100:
        break

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

model_name = "open-mistral-7b"
tokenizer = MistralTokenizer.from_model(model_name)
print(tokenize_function)

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=16, shuffle=True)

<function tokenize_function at 0x0000018B2F8A8B80>


AttributeError: 'dict' object has no attribute 'map'

In [2]:
# Create a new ModelArgs object with the desired configuration
model_args = ModelArgs(
    dim=512,
    n_layers=8,
    head_dim=64,
    hidden_dim=2048,
    n_heads=8,
    n_kv_heads=8,
    vocab_size=32000,
    norm_eps=1e-5,
    max_batch_size=3,
)

# Create a new Transformer object with random weights
model = Transformer(model_args).to(device)

In [3]:
torchinfo.summary(model)

Layer (type:depth-idx)                   Param #
Transformer                              --
├─Embedding: 1-1                         16,384,000
├─ModuleList: 1-2                        --
│    └─TransformerBlock: 2-1             --
│    │    └─Attention: 3-1               1,048,576
│    │    └─FeedForward: 3-2             3,145,728
│    │    └─RMSNorm: 3-3                 512
│    │    └─RMSNorm: 3-4                 512
│    └─TransformerBlock: 2-2             --
│    │    └─Attention: 3-5               1,048,576
│    │    └─FeedForward: 3-6             3,145,728
│    │    └─RMSNorm: 3-7                 512
│    │    └─RMSNorm: 3-8                 512
│    └─TransformerBlock: 2-3             --
│    │    └─Attention: 3-9               1,048,576
│    │    └─FeedForward: 3-10            3,145,728
│    │    └─RMSNorm: 3-11                512
│    │    └─RMSNorm: 3-12                512
│    └─TransformerBlock: 2-4             --
│    │    └─Attention: 3-13              1,048,576
│    │  

In [None]:
optimizer = Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

model.train()
for batch in train_dataloader:
    optimizer.zero_grad()
    input_ids = batch["input_ids"].to(device)
    labels = batch["input_ids"].to(device)
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
