<a href="https://colab.research.google.com/github/Maurog-rgba/llm-python/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
from IPython import get_ipython
from IPython.display import display
from tqdm import tqdm

In [56]:
import torch
import time
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Definir device (GPU se disponível)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando: {device}")


# Carregar tokenizer e modelo (exemplo, substitua pelo seu)
tokenizer = AutoTokenizer.from_pretrained("pierreguillou/gpt2-small-portuguese")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("pierreguillou/gpt2-small-portuguese").to(device)

Usando: cuda


tokenizer_config.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/850k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/508k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [66]:
def load_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

# Carregar e dividir o texto
file_path = "dom_casmurro.txt"  # Nome do arquivo após upload
full_text = load_text_file(file_path)
texts = full_text.split("\n\n")
texts = [t.strip() for t in texts if t.strip()]
texts = texts[:500]

print(f"Total de trechos: {len(texts)}")

Total de trechos: 500


In [67]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokens = [
            tokenizer(
                text,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )["input_ids"].squeeze(0)
            for text in texts
        ]

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        input_ids = self.tokens[idx]
        return input_ids, input_ids

dataset = TextDataset(texts, tokenizer)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.stack(inputs).to(device)
    targets = torch.stack(targets).to(device)
    return inputs, targets

dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn, num_workers=0)

In [74]:
# --- Training Setup ---
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)  # Adjusted learning rate
criterion = torch.nn.CrossEntropyLoss()
vocab_size = tokenizer.vocab_size
num_epochs = 10

# --- Training Loop ---
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()  # Set the model to training mode

    # Use tqdm to create a progress bar for the dataloader
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs}")

    epoch_loss = 0  # To accumulate loss over the epoch

    for i, (src, tgt) in progress_bar:
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output.logits[:, :-1].reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()  # Accumulate loss

        # Update the progress bar description with the current loss and time elapsed
        progress_bar.set_postfix({
            "Loss": loss.item(),
            "Time": f"{time.time() - start_time:.2f}s"
        })

    # Calculate average loss for the epoch
    avg_epoch_loss = epoch_loss / len(dataloader)

    # Print epoch summary
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_epoch_loss:.4f}, Time: {time.time() - start_time:.2f}s")

Epoch 1/10: 100%|██████████| 250/250 [00:27<00:00,  9.08it/s, Loss=0.186, Time=27.53s]


Epoch 1/10, Avg Loss: 0.1558, Time: 27.53s


Epoch 2/10: 100%|██████████| 250/250 [00:26<00:00,  9.33it/s, Loss=0.106, Time=26.79s]


Epoch 2/10, Avg Loss: 0.1339, Time: 26.80s


Epoch 3/10: 100%|██████████| 250/250 [00:28<00:00,  8.72it/s, Loss=0.178, Time=28.68s]


Epoch 3/10, Avg Loss: 0.1302, Time: 28.68s


Epoch 4/10: 100%|██████████| 250/250 [00:26<00:00,  9.47it/s, Loss=0.161, Time=26.40s]


Epoch 4/10, Avg Loss: 0.1164, Time: 26.41s


Epoch 5/10: 100%|██████████| 250/250 [00:26<00:00,  9.32it/s, Loss=0.05, Time=26.81s]


Epoch 5/10, Avg Loss: 0.1115, Time: 26.82s


Epoch 6/10: 100%|██████████| 250/250 [00:26<00:00,  9.44it/s, Loss=0.0716, Time=26.49s]


Epoch 6/10, Avg Loss: 0.1052, Time: 26.50s


Epoch 7/10: 100%|██████████| 250/250 [00:26<00:00,  9.47it/s, Loss=0.0726, Time=26.39s]


Epoch 7/10, Avg Loss: 0.0967, Time: 26.39s


Epoch 8/10: 100%|██████████| 250/250 [00:26<00:00,  9.44it/s, Loss=0.0652, Time=26.50s]


Epoch 8/10, Avg Loss: 0.0887, Time: 26.50s


Epoch 9/10: 100%|██████████| 250/250 [00:26<00:00,  9.43it/s, Loss=0.219, Time=26.52s]


Epoch 9/10, Avg Loss: 0.0834, Time: 26.52s


Epoch 10/10: 100%|██████████| 250/250 [00:26<00:00,  9.45it/s, Loss=0.11, Time=26.45s]

Epoch 10/10, Avg Loss: 0.0838, Time: 26.45s





In [75]:
def generate_text(prompt, model, tokenizer, max_length=50, temperature=0.7):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
        max_gen_length = min(max_length, 128 - tokens.shape[1])
        for _ in range(max_gen_length):
            output = model(tokens)
            logits = output.logits[:, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            tokens = torch.cat([tokens, next_token], dim=1)
            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(tokens.squeeze(), skip_special_tokens=True)

In [76]:
generated_text = generate_text("Olá, como você está?", model, tokenizer)
print(generated_text)

Olá, como você está?
