## Clonando o repositório

In [1]:
import os
import sys

repo_url = "https://github.com/losout0/deeplearning-final.git"

repo_dir = "deeplearning-final"

!rm -rf {repo_dir}
!git clone {repo_url} {repo_dir}

project_path = os.path.join(os.getcwd(), repo_dir, 'src')

if project_path not in sys.path:
    sys.path.append(project_path)

print(f"Repositório público clonado e caminho adicionado ao sys.path: {project_path}")

Cloning into 'deeplearning-final'...
remote: Enumerating objects: 204, done.[K
remote: Counting objects: 100% (204/204), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 204 (delta 85), reused 154 (delta 52), pack-reused 0 (from 0)[K
Receiving objects: 100% (204/204), 8.57 MiB | 18.02 MiB/s, done.
Resolving deltas: 100% (85/85), done.
Repositório público clonado e caminho adicionado ao sys.path: /kaggle/working/deeplearning-final/src


## Baixando e preparando os dados de treinamento

In [2]:
!python /kaggle/working/deeplearning-final/scripts/download_data.py
!python /kaggle/working/deeplearning-final/scripts/prepare_data.py

Baixando dom_casmurro.txt: 100%|█████████████| 398k/398k [00:00<00:00, 1.44MB/s]
Baixando memorias_postumas_de_bras_cubas.txt: 100%|█| 383k/383k [00:00<00:00, 1.
Baixando poesias_completas.txt: 100%|█████████| 270k/270k [00:00<00:00, 970kB/s]
Baixando quincas_borba.txt: 100%|████████████| 473k/473k [00:00<00:00, 1.76MB/s]
Baixando esau_e_jaco.txt: 100%|██████████████| 453k/453k [00:00<00:00, 1.62MB/s]
Baixando papeis_avulsos.txt: 100%|███████████| 355k/355k [00:00<00:00, 1.64MB/s]
Baixando helena.txt: 100%|███████████████████| 370k/370k [00:00<00:00, 1.73MB/s]
Baixando historias_sem_data.txt: 100%|███████| 331k/331k [00:00<00:00, 1.55MB/s]
Baixando a_mao_e_a_luva.txt: 100%|███████████| 232k/232k [00:00<00:00, 1.16MB/s]
Baixando reliquias_de_casa_velha.txt: 100%|██| 293k/293k [00:00<00:00, 1.37MB/s]
Baixando memorial_de_ayres.txt: 100%|████████| 303k/303k [00:00<00:00, 1.44MB/s]
Baixando iaia_garcia.txt: 100%|██████████████| 344k/344k [00:00<00:00, 1.59MB/s]

Download de todos os arquiv

## Configurações para o treinamento

In [3]:
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
import time
import csv
import os
import sys
from itertools import cycle
from pathlib import Path

In [4]:
from utils import text_to_token_ids, token_ids_to_text, tokenizer, generate_text, get_loaders
from gpt_2.model_geral import Transformer

In [5]:
# --- Training configuration ---
# Single dictionary passed both to the model constructor and to the training loop.
CONFIG = {
    # Model settings
    "vocab_size": tokenizer.n_vocab,
    "embedding_dim": 512,
    "context_length": 256,
    "num_layers": 8,
    "num_heads": 8,
    "bias": False,
    "num_kv_groups": 8,
    "dtype": torch.float32,
    "num_experts": 8,
    "num_experts_per_token": 2,
    "emb_dim_moe": 64,
    "apply_rope": False,

    # Training settings
    "max_iterations": 50000,
    "learning_rate": 0.0003,
    "weight_decay": 0.1,
    "batch_size": 4,

    # Evaluation and logging settings
    "eval_freq": 200,   # evaluate/checkpoint every this many iterations
    "eval_iter": 50,    # max number of batches averaged per evaluation
    "start_context": "Se o jardim",  # prompt used for the generated sample

    # File settings
    "checkpoint_save_path": "checkpoints/checkpoint_latest.pth",
    "best_model_save_path": "checkpoints/model_best.pth",
    "log_file": "logs/training_log.csv"
}

# Create the output directories from the configured paths themselves, so that
# editing a path above can never leave its parent directory missing.
for _path_key in ("checkpoint_save_path", "best_model_save_path", "log_file"):
    _out_dir = os.path.dirname(CONFIG[_path_key])
    if _out_dir:  # a bare filename has no parent directory to create
        os.makedirs(_out_dir, exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fixed seed for reproducible runs.
torch.manual_seed(123)

<torch._C.Generator at 0x7ca3b59b72d0>

## Funções auxiliares

In [6]:
def calc_loss_batch(model, input_batch, target_batch, device):
    """Return the cross-entropy loss of `model` on one batch.

    Both tensors are moved to `device`, the model is run on the inputs, and
    the logits are flattened over their first two dimensions so that they
    line up with the fully flattened targets.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    predictions = model(inputs)
    flat_logits = predictions.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

def calc_loss_loader(model, data_loader, device, num_batches):
    """Return the mean per-batch loss over at most `num_batches` batches.

    Args:
        model: Callable evaluated through `calc_loss_batch`.
        data_loader: Sized iterable yielding `(input_batch, target_batch)` pairs.
        device: Device forwarded to `calc_loss_batch`.
        num_batches: Upper bound on the number of batches to evaluate; it is
            capped at `len(data_loader)`.

    Returns:
        The average loss as a float, or NaN when the loader is empty,
        `num_batches` is not positive, or no batch could be drawn.
    """
    if len(data_loader) == 0:
        return float('nan')

    batch_limit = min(num_batches, len(data_loader))
    total_loss = 0.0
    batches_done = 0
    # zip() stops at whichever runs out first, so a loader that yields fewer
    # batches than its reported length simply ends the loop early.
    for _, (input_batch, target_batch) in zip(range(batch_limit), data_loader):
        total_loss += calc_loss_batch(model, input_batch, target_batch, device).item()
        batches_done += 1

    # Average over the batches actually evaluated, not over the planned count,
    # so an early-exhausted loader does not bias the mean towards zero.
    return total_loss / batches_done if batches_done > 0 else float('nan')

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the training and validation losses without tracking gradients.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Each loss is averaged by `calc_loss_loader`
    over at most `eval_iter` batches.

    Returns:
        A `(train_loss, val_loss)` tuple.
    """
    model.eval()
    with torch.no_grad():
        # Same order as the returned tuple: training loader first, then validation.
        losses = [
            calc_loss_loader(model, loader, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return tuple(losses)

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate a 50-token continuation of `start_context` and print it on one line.

    The model is put in eval mode while generating and switched back to train
    mode afterwards.

    Args:
        model: Model exposing `pos_embeddings.weight`; the first dimension of
            that weight is passed to `generate_text` as the context size.
        tokenizer: Tokenizer forwarded to `text_to_token_ids` and
            `token_ids_to_text`.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text the sample starts from.
    """
    model.eval()
    context_size = model.pos_embeddings.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(model=model, idx=encoded, max_new_tokens=50, context_size=context_size)
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    # Line breaks in generated text are data, not platform line terminators:
    # os.linesep is "\r\n" on Windows and would leave bare "\n" untouched, so
    # both forms are replaced explicitly.
    single_line = decoded_text.replace("\r\n", " ").replace("\n", " ")
    print(f"Amostra Gerada: '{single_line}'")
    model.train()

## Loop de treinamento

In [7]:
def _endless_batches(data_loader):
    """Yield batches from `data_loader` indefinitely, restarting it whenever it runs out.

    Unlike `itertools.cycle`, nothing is cached: every pass re-iterates the
    loader itself. An empty loader ends the generator instead of spinning forever.
    """
    while True:
        produced_any = False
        for batch in data_loader:
            produced_any = True
            yield batch
        if not produced_any:
            return


def train_model_by_iterations(model, optimizer, scheduler, train_loader, val_loader, config, device):
    """Train `model` for a fixed number of optimizer steps, with periodic evaluation.

    Every `config["eval_freq"]` steps (and on the last step) the function
    evaluates the train/validation loss, saves the latest weights, saves the
    best-so-far weights when the validation loss improves, appends a row to
    the CSV log and prints a generated sample.

    Args:
        model: Model to optimise; its `state_dict()` is what gets checkpointed.
        optimizer: Optimizer stepped once per iteration.
        scheduler: Learning-rate scheduler stepped once per iteration.
        train_loader: Iterable of `(input_batch, target_batch)` training pairs.
        val_loader: Loader used for the validation loss.
        config: Dict providing "max_iterations", "eval_freq", "eval_iter",
            "start_context", "checkpoint_save_path", "best_model_save_path"
            and "log_file".
        device: Device forwarded to the loss/evaluation helpers.

    Note:
        The sample generation relies on the module-level `tokenizer`.
    """
    start_time = time.time()
    log_file_path = config["log_file"]

    # Prepare the CSV file: the header is written only when the file does not
    # exist yet, so an existing log keeps being appended to.
    log_header = ["iteration", "train_loss", "val_loss", "tokens_seen", "learning_rate", "timestamp"]
    if not os.path.exists(log_file_path):
        with open(log_file_path, "w", newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(log_header)

    print("Iniciando o treinamento por iterações...")

    # Tracks the best validation loss seen so far.
    best_val_loss = float('inf')

    # itertools.cycle would keep a copy of every batch it yields (memory that
    # grows with each step) and replay those copies after the first pass.
    # Re-iterating the loader avoids both problems; a shuffling loader
    # presumably draws a new order on each pass.
    train_data_iter = _endless_batches(train_loader)
    tokens_seen = 0

    for step in range(config["max_iterations"]):
        input_batch, target_batch = next(train_data_iter)

        optimizer.zero_grad()
        loss = calc_loss_batch(model, input_batch, target_batch, device)
        loss.backward()
        optimizer.step()
        scheduler.step()

        tokens_seen += input_batch.numel()

        is_last_step = (step == config["max_iterations"] - 1)
        if step % config["eval_freq"] == 0 or is_last_step:
            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, config["eval_iter"])

            print(
                f"[Iteração {step:05d}/{config['max_iterations']}] | "
                f"Perda Treino: {train_loss:.3f} | "
                f"Perda Validação: {val_loss:.3f}"
            )

            # --- Saving logic ---
            # 1. Always overwrite the "latest" checkpoint.
            torch.save(model.state_dict(), config["checkpoint_save_path"])

            # 2. Keep a separate copy whenever the validation loss improves.
            #    A NaN validation loss never compares as smaller, so it is never saved as best.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), config["best_model_save_path"])
                print(f"  -> Nova melhor perda de validação: {best_val_loss:.3f}. Modelo salvo em '{config['best_model_save_path']}'")

            # Append this evaluation to the CSV log. The learning rate is read
            # after scheduler.step(), i.e. it is the rate of the next step.
            current_lr = optimizer.param_groups[0]['lr']
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
            log_data = [step, f"{train_loss:.4f}", f"{val_loss:.4f}", tokens_seen, f"{current_lr:.6f}", timestamp]
            with open(log_file_path, "a", newline="", encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(log_data)

            generate_and_print_sample(model, tokenizer, device, config["start_context"])
            print("-" * 50)

    total_time = time.time() - start_time
    print("Treinamento concluído!")
    print(f"Tempo total de treinamento: {total_time:.2f} segundos.")
    print(f"Resultados de log salvos em: '{log_file_path}'")

# Treinamento

## Carregando os dados

In [8]:
# Build the three loaders from the prepared data folder, using the context
# length and batch size defined in CONFIG.
train_loader, test_loader, val_loader = get_loaders(
    batch_sz=CONFIG["batch_size"],
    max_length=CONFIG["context_length"],
    tokenizer=tokenizer,
    data_path="data/processed",
)

# Report len() of each loader.
# NOTE(review): if these are DataLoaders, len() counts batches, not samples — confirm.
print(
    f"Tamanho do conjunto de treinamento: {len(train_loader)}\n"
    f"Tamanho do conjunto de teste: {len(test_loader)}\n"
    f"Tamanho do conjunto de validação: {len(val_loader)}"
)

Tamanho do conjunto de treinamento: 210940
Tamanho do conjunto de teste: 27208
Tamanho do conjunto de validação: 26500


## Configurando o modelo

In [9]:
# Instantiate the network from CONFIG and make sure it lives on DEVICE.
model = Transformer(CONFIG, device=DEVICE)
model = model.to(device=DEVICE)

# AdamW with the learning rate and weight decay taken from CONFIG.
optimizer = torch.optim.AdamW(
    model.parameters(),
    weight_decay=CONFIG["weight_decay"],
    lr=CONFIG["learning_rate"],
)

# Resume from the latest checkpoint when one exists. Only the model weights
# are restored here; the optimizer starts fresh.
try:
    model.load_state_dict(
        torch.load(CONFIG["checkpoint_save_path"], map_location=DEVICE)
    )
except FileNotFoundError:
    print("Nenhum checkpoint encontrado, iniciando do zero.")
else:
    print(f"Pesos do checkpoint '{CONFIG['checkpoint_save_path']}' carregados com sucesso!")

Nenhum checkpoint encontrado, iniciando do zero.


## Treinando

In [10]:
# Cosine annealing of the learning rate across the whole run: one scheduler
# step per training iteration, with `eta_min` as the lower bound.
scheduler = CosineAnnealingLR(
    optimizer,
    eta_min=3e-5,
    T_max=CONFIG["max_iterations"],
)

In [None]:
# Put the model in training mode, then start the iteration-based training loop.
model.train()
train_model_by_iterations(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    val_loader=val_loader,
    config=CONFIG,
    device=DEVICE,
)

Iniciando o treinamento por iterações...
[Iteração 00000/50000] | Perda Treino: 12.169 | Perda Validação: 12.170
  -> Nova melhor perda de validação: 12.170. Modelo salvo em 'checkpoints/model_best.pth'
Amostra Gerada: 'Se o jardim况stod,,,%。  ,,nelle profundas precisComing’w,hift intime നായ a eitth jackбанк নিয়ে迅机械هلfiyaonians官方群929Stand220วบ глуб (pk_upload wong זאלwart Ital "*"Momentumечения142 সকালpersons strips respuesta/Image,'
--------------------------------------------------
