In [1]:
# Standard library imports
import json
import gc
import os
from pathlib import Path
from typing import List, Optional

# Third-party library imports
import fire
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
from datasets import load_dataset
from torch import optim
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from mistral_common.protocol.instruct.messages import (
    UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local application/library-specific imports
from mistral import ModelArgs, Transformer, RMSNorm, precompute_freqs_cis, generate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os
import logging
import datetime
import pickle
from tqdm import tqdm

In [2]:
# # Load the dataset in streaming mode
# ds = load_dataset("HuggingFaceTB/cosmopedia", "stories", streaming=True,)

# # Initialize a counter
# counter = 0

# # Iterate over the dataset
# dataset = {
#     "text": [],
# }

# for sample in ds["train"]:
#     dataset["text"].append(sample["text"])
#     counter += 1
#     if counter >= 1000:
#         break

In [3]:
# def save_data(dataset_dict): 
#     with open('data/cosmopedia.pkl', 'wb') as f:
#         pickle.dump(dataset_dict, f)

# save_data(dataset)

In [4]:
def load_data(path='data/cosmopedia.pkl'):
    """Load a pickled dataset from disk.

    Args:
        path: Location of the pickle file. Defaults to
            ``data/cosmopedia.pkl``, the path this notebook has always used.

    Returns:
        The object that was stored in the pickle file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only open files produced by a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Load the cached dataset from the pickle file on disk into memory
dataset = load_data()

In [5]:
# Read the whole training text (UTF-8) into a single string
markdown_content = Path("./data/aurel.txt").read_text(encoding='utf-8')

# Preview the first 100 characters as a quick sanity check
print(markdown_content[:100])

MARCO AURELIO ANTONINO

Colloqui con se stesso

UUID: 1c19eaae-308e-11e9-b17a-17532927e555
Questo l


In [6]:
import torch 
from torch.utils.data import Dataset, DataLoader 

class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once and cut into windows of ``max_length`` tokens,
    one window every ``stride`` tokens. Each item is an
    ``(input_ids, target_ids)`` pair in which the target is the input window
    shifted one token to the right.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        """Tokenize ``text`` and pre-compute every training window.

        Args:
            text: Raw text to tokenize.
            tokenizer: Object exposing ``encode_chat_completion``; the value
                it returns must provide the token ids as ``.tokens``.
            max_length: Number of tokens per window (must be positive).
            stride: Offset in tokens between consecutive windows
                (must be positive).

        Raises:
            ValueError: If ``max_length`` or ``stride`` is not positive.
        """
        if max_length <= 0 or stride <= 0:
            raise ValueError(
                f"max_length and stride must be positive, got "
                f"max_length={max_length}, stride={stride}"
            )

        self.input_ids = []
        self.target_ids = []

        # The text is encoded as one user message of a chat-completion
        # request, so whatever template tokens the tokenizer adds around the
        # message are part of the token stream that gets windowed below.
        tokenized = tokenizer.encode_chat_completion(
            ChatCompletionRequest(
                messages=[UserMessage(content=text)],
                model="open-mistral-7b",
            )
        )
        token_ids = tokenized.tokens

        # The last usable start index leaves room for the target window, which
        # extends one token past the input window. A text with no more than
        # max_length tokens therefore produces an empty dataset.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of pre-computed windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader(text, batch_size=4, max_length=256,
                      stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) pairs for ``text``.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Offset in tokens between consecutive windows.
        shuffle: Whether the loader shuffles the windows.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
        num_workers: Number of worker processes used by the loader.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``text``.
    """
    tokenizer = MistralTokenizer.v1()
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Honour the caller's value; this used to be hard-coded to 0, which
        # silently ignored the parameter.
        num_workers=num_workers,
    )

In [8]:
# Window length (in tokens) and batch size for the loader built below
max_length = 32
batch_size = 1

# stride=1 produces one window per token offset; shuffling is disabled so the
# first batch printed below is the start of the text.
dataloader = create_dataloader(
    markdown_content,
    batch_size=batch_size,
    max_length=max_length,
    stride=1,
    shuffle=False,
)

# Pull a single (input, target) batch to inspect it
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[    1,   733, 16289, 28793,   351,  1087,  1998,   330,  5873, 28758,
          4426,  2976, 17517,   775, 28762,    13,    13,  1577,   731,   364,
         28710,   379,   427, 22859,    13,    13, 29683, 21033, 28747, 28705,
         28740, 28717]]), tensor([[  733, 16289, 28793,   351,  1087,  1998,   330,  5873, 28758,  4426,
          2976, 17517,   775, 28762,    13,    13,  1577,   731,   364, 28710,
           379,   427, 22859,    13,    13, 29683, 21033, 28747, 28705, 28740,
         28717, 28740]])]


In [9]:
# Create a new ModelArgs object with the desired configuration
model_args = ModelArgs(
    dim=256,
    n_layers=6,
    head_dim=64,
    hidden_dim=256,
    n_heads=12,
    n_kv_heads=12,
    vocab_size=32000,
    norm_eps=1e-5,
    max_batch_size=3,
)

# Create a new Transformer object with random weights. It is placed on
# `device` (chosen in the imports cell) rather than a hard-coded "cuda", so the
# notebook also runs on machines without a GPU.
model = Transformer(model_args).to(device, dtype=torch.float32)
#model = torch.compile(model)

In [10]:
# Display a per-layer parameter-count summary of the model
torchinfo.summary(model)

Layer (type:depth-idx)                   Param #
Transformer                              --
├─Embedding: 1-1                         8,192,000
├─ModuleList: 1-2                        --
│    └─TransformerBlock: 2-1             --
│    │    └─Attention: 3-1               786,432
│    │    └─FeedForward: 3-2             196,608
│    │    └─RMSNorm: 3-3                 256
│    │    └─RMSNorm: 3-4                 256
│    └─TransformerBlock: 2-2             --
│    │    └─Attention: 3-5               786,432
│    │    └─FeedForward: 3-6             196,608
│    │    └─RMSNorm: 3-7                 256
│    │    └─RMSNorm: 3-8                 256
│    └─TransformerBlock: 2-3             --
│    │    └─Attention: 3-9               786,432
│    │    └─FeedForward: 3-10            196,608
│    │    └─RMSNorm: 3-11                256
│    │    └─RMSNorm: 3-12                256
│    └─TransformerBlock: 2-4             --
│    │    └─Attention: 3-13              786,432
│    │    └─FeedForward

In [11]:
# Number of bytes in one megabyte (2**20), used for the conversion below
bytes_per_mb = 1024 ** 2

# Ask PyTorch how many bytes it currently has allocated on the GPU and
# express that figure in megabytes
allocated_bytes = torch.cuda.memory_allocated()
allocated_mb = allocated_bytes / bytes_per_mb

print(f'Memory allocated by PyTorch on the GPU: {allocated_mb:.2f} MB')

Memory allocated by PyTorch on the GPU: 1846.51 MB


In [12]:
# Element counts of the model's parameters and buffers
num_params = sum(p.numel() for p in model.parameters())
num_buffers = sum(b.numel() for b in model.buffers())

# Convert to bytes using each tensor's real element size instead of assuming
# 4 bytes (float32), so the figures stay correct if the dtype changes.
params_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
buffers_bytes = sum(b.numel() * b.element_size() for b in model.buffers())

# Convert to megabytes
params_mb = params_bytes / 1_048_576
buffers_mb = buffers_bytes / 1_048_576

print(f'Number of parameters: {num_params}')
print(f'Number of buffers: {num_buffers}')
print(f'Memory for parameters: {params_mb:.2f} MB')
print(f'Memory for buffers: {buffers_mb:.2f} MB')

Number of parameters: 22285568
Number of buffers: 0
Memory for parameters: 85.01 MB
Memory for buffers: 0.00 MB


In [None]:
# Create directories if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('logs', exist_ok=True)

# Timestamp used to tag this run's log file and checkpoints
now = datetime.datetime.now()
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

# Find the next available run number
run_number = 0
while os.path.exists(f'logs/training-run-{run_number}-{date_time}.log'):
    run_number += 1

# Set up logging. force=True replaces handlers installed by an earlier run of
# this cell; without it basicConfig is a no-op on re-run and the new run would
# keep writing to the previous log file.
logging.basicConfig(
    filename=f'logs/training-run-{run_number}-{date_time}.log',
    level=logging.INFO,
    force=True,
)

# Move model to the training device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mixed precision is only enabled on CUDA; on CPU the scaler and autocast
# below are constructed disabled and the loop runs in plain float32.
use_amp = device.type == "cuda"

optimizer = optim.Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

# Every window produced by the dataloader has max_length tokens, so a single
# positions tensor can be shared by all batches.
positions = torch.arange(0, max_length, device=device)

# Initialize the GradScaler for mixed precision training
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

model.train()
num_epochs = 5
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(dataloader):
        input_batch = batch[0].to(device)
        # Targets stay integer token ids: CrossEntropyLoss interprets integer
        # targets as class indices and floating-point targets as class
        # probabilities, so casting them to float was wrong.
        target_batch = batch[1].to(device)

        optimizer.zero_grad(set_to_none=True)

        # Forward pass with autocast for mixed precision
        with torch.cuda.amp.autocast(enabled=use_amp):
            # NOTE(review): assumes the model returns logits shaped
            # (batch, seq, vocab) - confirm against mistral.Transformer.
            # The previous `[:, :, -1]` slice kept only the last vocabulary
            # entry, so the loss was never computed over the vocabulary.
            logits = model(input_batch, positions)
            loss = loss_fn(
                logits.reshape(-1, logits.size(-1)),
                target_batch.reshape(-1),
            )

        # Backward pass and optimization step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar once; each .item() call forces a device sync
        loss_value = loss.item()

        # Logging
        print(f'\rEpoch {epoch+1}, Batch {batch_idx} of {len(dataloader)}, Loss: {loss_value}', end='')
        logging.info(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss_value}')

        # Save model checkpoint every 5000 batches
        if (batch_idx + 1) % 5000 == 0:
            torch.save(model.state_dict(), f'models/model_checkpoint_run-{run_number}_{date_time}_epoch-{epoch+1}_batch-{batch_idx}.pth')

    # Release memory once per epoch. Emptying the CUDA cache after every batch
    # only slows training down, because the allocator reuses cached blocks.
    gc.collect()
    torch.cuda.empty_cache()

In [14]:
prompt = "Se per un uomo"
tokenizer = MistralTokenizer.v1()

# The training cell leaves the model in train mode; switch to eval mode before
# sampling so any train-only layer behaviour is disabled during inference.
model.eval()

# Generate up to max_length new tokens for the prompt and show the decoded text
generated_text, logprobs = generate([prompt], model, tokenizer, max_tokens=max_length)
print(generated_text)

['[INST] Se per un uomo [/INST]ния bast från oven mand does mid attachótaisLeftӀenceracom erstenbestHorizontalez loan theatre briefly$) undergroundétait MargPreferences acquisition Democratic银 setTimeout bags реги']


In [None]:
import matplotlib.pyplot as plt

# Training log to visualise
log_file_path = "logs/training-run-0-2024-07-02_21-02-05.log"
with open(log_file_path, "r") as file:
    lines = file.readlines()

# Series extracted from the log, one entry per logged batch
epochs, batches, losses = [], [], []


def _last_token(field):
    """Return the last whitespace-delimited token of a log field."""
    return field.split()[-1]


# A record looks like "... Epoch E, Batch B, Loss: L": three comma-separated
# fields whose value is the final token of each field.
for record in lines:
    if "Loss" not in record:
        continue
    fields = record.strip().split(',')
    epoch_no = int(_last_token(fields[0]))
    batch_no = int(_last_token(fields[1]))
    loss_val = float(_last_token(fields[2]))

    epochs.append(epoch_no)
    batches.append(batch_no)
    losses.append(loss_val)

# Draw the loss against the running index of logged batches
fig, ax = plt.subplots()
ax.plot(range(len(losses)), losses, label="Loss")
ax.set(title="Training Loss Curve", xlabel="Batch", ylabel="Loss")
ax.legend()

# Write the figure to disk, then display it
plt.savefig("loss.png")
plt.show()