# Colab: Treinamento de Embeddings do Repositório

Notebook para executar no Google Colab um pipeline completo: preparação dos dados, fine-tuning de um modelo de embeddings e publicação no Hugging Face Hub. Antes de começar, configure um runtime com GPU (por exemplo, T4).

## 0) Variáveis de ambiente / segredos
Defina no Colab ("Manage sessions -> Environment variables" ou via `os.environ`) antes de executar:
- `HF_TOKEN`: token do Hugging Face com permissão de write.
- `HF_REPO_ID`: ex. `org/nome-modelo-embeddings`.
- `HF_DATASET_ID`: ex. `org/nome-dataset-embeddings`.
- `GIT_REPO_URL`: URL HTTPS do repositório a ser clonado (se não for definida, o código usa o valor de exemplo `https://github.com/example/repo.git`).

In [None]:
# Install the third-party packages for this notebook (-q keeps pip output quiet).
# NOTE(review): `tiktoken` and `evaluate` are not imported by any cell visible
# here — confirm they are still needed.
!pip install -q sentence-transformers datasets huggingface_hub gitpython tiktoken
!pip install -q evaluate

In [None]:
import os, pathlib, json, textwrap
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder
from sentence_transformers import SentenceTransformer, losses, util, models
from torch.utils.data import DataLoader
import torch
import git

# Run configuration, read from environment variables. Everything except
# HF_TOKEN falls back to a placeholder default when the variable is unset.
HF_TOKEN = os.environ.get('HF_TOKEN')
HF_REPO_ID = os.environ.get('HF_REPO_ID', 'org/nome-modelo-embeddings')
HF_DATASET_ID = os.environ.get('HF_DATASET_ID', 'org/nome-dataset-embeddings')
GIT_REPO_URL = os.environ.get('GIT_REPO_URL', 'https://github.com/example/repo.git')

# Local working directories: the repository checkout and a data folder.
BASE_DIR = pathlib.Path('/content/repo')
DATA_DIR = pathlib.Path('/content/data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Fail fast with an explicit exception when the token is missing or empty.
# (An `assert` would be silently skipped when Python runs with -O.)
if not HF_TOKEN:
    raise RuntimeError('Defina HF_TOKEN no ambiente.')

# Store the token locally and build an authenticated API client.
# NOTE(review): HfFolder is a legacy helper in huggingface_hub — confirm it is
# still available in the installed version.
HfFolder.save_token(HF_TOKEN)
api = HfApi(token=HF_TOKEN)
# whoami() queries the Hub, so this line also verifies that the token works.
print('Autenticado no HF Hub como', api.whoami()['name'])

In [None]:
# Make sure a checkout of the repository is available at BASE_DIR.
if BASE_DIR.exists():
    # The directory is already there: pull the latest changes from `origin`.
    print('Repositório já existe, atualizando...')
    repo = git.Repo(BASE_DIR)
    repo.remotes.origin.pull()
else:
    # Nothing on disk yet: clone from the configured URL.
    print('Clonando repositório...')
    git.Repo.clone_from(GIT_REPO_URL, BASE_DIR)

def load_texts(root: pathlib.Path):
    """Yield the UTF-8 text files found anywhere under *root*.

    The directory tree is walked recursively. Only regular files whose
    extension (compared case-insensitively) is one of the supported
    documentation/source-code suffixes are read.

    Args:
        root: Directory to scan.

    Yields:
        dict: ``{'path': <file path as str>, 'text': <file contents>}`` for
        every file that could be read and decoded as UTF-8.
    """
    exts = {'.md', '.txt', '.py', '.js', '.ts', '.rs', '.java'}
    for path in root.rglob('*'):
        if path.suffix.lower() not in exts or not path.is_file():
            continue
        try:
            text = path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError):
            # Best effort: skip files that cannot be read or are not valid
            # UTF-8, but let unrelated errors surface.
            continue
        yield {'path': str(path), 'text': text}

# Drain the generator so the samples can be counted and iterated again later.
raw_samples = [*load_texts(BASE_DIR)]
print('Total de arquivos carregados:', len(raw_samples))

In [None]:
def chunk_text(text, max_len=512, min_len=50):
    """Split *text* into chunks of roughly *max_len* characters.

    The text is cut at every ``'. '`` and newline; the resulting pieces are
    then greedily re-joined with single spaces while the joined length stays
    within *max_len*.

    Args:
        text: The text to split.
        max_len: Maximum length, in characters, of a joined chunk. A single
            piece that is longer than this is kept whole, not cut.
        min_len: Chunks whose stripped length is not greater than this value
            are discarded.

    Returns:
        list[str]: The stripped chunks, in their original order.
    """
    import re

    sentences = re.split(r'(?:\. |\n)', text)
    buffer, chunks = [], []
    buffer_len = 0  # len(' '.join(buffer)), tracked to avoid re-joining each step
    for sent in sentences:
        # Length the chunk would have if `sent` were appended to it.
        candidate_len = buffer_len + 1 + len(sent) if buffer else len(sent)
        if candidate_len > max_len and buffer:
            # Adding `sent` would overflow: close the chunk and start a new one.
            chunks.append(' '.join(buffer))
            buffer = []
            candidate_len = len(sent)
        buffer.append(sent)
        buffer_len = candidate_len
    if buffer:
        chunks.append(' '.join(buffer))
    stripped = (c.strip() for c in chunks)
    return [c for c in stripped if len(c) > min_len]

# Flatten every file into its chunks, keeping track of the originating file.
chunked = [
    {'text': piece, 'source': sample['path']}
    for sample in raw_samples
    for piece in chunk_text(sample['text'])
]
print('Total de chunks:', len(chunked))

# Wrap the chunks in a Dataset and push it to the Hub as a private dataset.
dataset = Dataset.from_list(chunked)
dataset.push_to_hub(HF_DATASET_ID, token=HF_TOKEN, private=True)
# A bare expression on the last line makes the notebook display the dataset.
dataset

In [None]:
# `model.fit` reads `.texts` / `.label` from every item the DataLoader yields,
# so the training rows must be InputExample objects rather than plain dicts.
from sentence_transformers import InputExample

model_name = 'sentence-transformers/all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)

# Each chunk is paired with itself; MultipleNegativesRankingLoss treats the
# other items of the batch as negatives.
train_samples = [InputExample(texts=[t, t]) for t in dataset['text']]
train_dataset = train_samples  # name kept for anything that still refers to it
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model)

epochs = 1
# Warm up the learning rate over the first 10% of the training steps.
warmup_steps = int(len(train_dataloader) * epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='/content/model-output'
)

# Make sure the target model repository exists before uploading; with
# exist_ok=True this is a no-op when the repository is already there.
# A newly created repository is private, like the dataset pushed earlier.
api.create_repo(
    repo_id=HF_REPO_ID,
    token=HF_TOKEN,
    repo_type='model',
    private=True,
    exist_ok=True,
)

# Publish the contents of the training output folder as a single commit.
api.upload_folder(
    folder_path='/content/model-output',
    repo_id=HF_REPO_ID,
    token=HF_TOKEN,
    repo_type='model',
    commit_message='Upload embedding model (colab training)'
)

In [None]:
# Simple evaluation (similarity): compare the first chunk with the second one,
# or with itself when the dataset holds a single chunk.
first_text = dataset[0]['text']
second_text = dataset[1]['text'] if len(dataset) > 1 else first_text
pairs = [
    (first_text, second_text),
]

# Encode the flattened pairs: even rows hold the left texts, odd rows the right.
emb = model.encode([p for pair in pairs for p in pair], convert_to_tensor=True)
# pytorch_cos_sim returns the full left-by-right matrix; only its diagonal
# compares each left text with its own partner.
scores = util.pytorch_cos_sim(emb[0::2], emb[1::2]).diagonal()
similarity_mean = scores.mean().item()
print('Cosine similarity (amostra):', similarity_mean)

# Record local artifacts (optional)
with open('/content/run_log.json', 'w', encoding='utf-8') as f:
    json.dump({'similarity_mean': similarity_mean, 'total_chunks': len(dataset)}, f, indent=2)