# 📚 BooksKDP - Tradutor Massivo com Vertex AI

**Pipeline:**
1. Clona o repositório do GitHub
2. Filtra livros em EN/ES/RU
3. Traduz massivamente para PT-BR usando Gemini
4. Salva as traduções e faz push para o GitHub

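**Requisitos:** TPU habilitado no Colab, um projeto Google Cloud com a API Vertex AI habilitada e um `GITHUB_TOKEN` configurado nos secrets do Colab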

In [None]:
# ============================================
# 1. INITIAL SETUP
# ============================================

# Install the third-party packages for this notebook (IPython shell magic).
# NOTE(review): gitpython is installed but no `git` module import is visible
# in these cells; the later cells call the git CLI instead - confirm it is needed.
!pip install -q google-cloud-aiplatform langdetect tqdm gitpython

import os
import re
import json
import time
import hashlib
import sqlite3
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, List, Dict
from tqdm.notebook import tqdm
from langdetect import detect

# Confirmation message shown once the installs and imports have run.
print("‚úÖ Bibliotecas instaladas")

In [None]:
# ============================================
# 2. VERTEX AI AUTHENTICATION
# ============================================

# Authenticate the Colab user before any Vertex AI call is made.
from google.colab import auth
auth.authenticate_user()

# Configure your project (the trailing @param comments are Colab form fields)
PROJECT_ID = "seu-projeto-id"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

import vertexai
from vertexai.generative_models import GenerativeModel, Part, GenerationConfig

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Gemini model; `model` is the global used by translate_chunk
model = GenerativeModel("gemini-1.5-flash-002")

# Generation settings used for translation; passed to every
# model.generate_content call in translate_chunk
generation_config = GenerationConfig(
    temperature=0.2,
    max_output_tokens=8192,
    top_p=0.95,
)

print(f"‚úÖ Vertex AI configurado: {PROJECT_ID}")

In [None]:
# ============================================
# 3. CLONE THE REPOSITORY
# ============================================

REPO_URL = "https://github.com/JoseRFJuniorLLMs/Googolplex-Books.git"
REPO_DIR = "/content/Googolplex-Books"

# Remove any existing copy so the clone below starts from scratch
!rm -rf {REPO_DIR}

# Clone
!git clone {REPO_URL} {REPO_DIR}

# Make the clone the working directory for the shell commands that follow
%cd {REPO_DIR}

# Set the git identity used for the commit made later in the notebook
!git config user.email "colab@vertex.ai"
!git config user.name "Colab Vertex Translator"

# NOTE(review): printed unconditionally; the exit status of `git clone`
# is not checked here.
print(f"‚úÖ Reposit√≥rio clonado em {REPO_DIR}")

In [None]:
# ============================================
# 4. CONFIGURATION
# ============================================

# Source texts, translated output and the SQLite cache all live in the clone
TXT_DIR = Path(REPO_DIR) / "txt"
TRANSLATED_DIR = Path(REPO_DIR) / "translated"
CACHE_DB = Path(REPO_DIR) / "data" / "translation_cache.db"

# Create the output and cache directories
TRANSLATED_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DB.parent.mkdir(parents=True, exist_ok=True)

# Source languages to translate
TARGET_LANGUAGES = {'en', 'es', 'ru'}  # English, Spanish, Russian

# Parallel processing
# NOTE(review): MAX_WORKERS is not referenced by any cell visible here;
# books and chunks are translated sequentially - confirm whether it is needed.
MAX_WORKERS = 5  # Parallel requests to Gemini
CHUNK_SIZE = 3000  # Characters per chunk
RATE_LIMIT_DELAY = 0.5  # Seconds between requests

print(f"üìÅ TXT: {TXT_DIR}")
print(f"üìÅ Translated: {TRANSLATED_DIR}")
print(f"üåç Idiomas: {TARGET_LANGUAGES}")

In [None]:
# ============================================
# 5. TRANSLATION CACHE
# ============================================

class TranslationCache:
    """SQLite-backed cache mapping a source text to its translation.

    Rows are keyed by the first 32 hex characters of the SHA-256 digest of
    the original text. Every operation opens its own connection and closes
    it explicitly when done.
    """

    def __init__(self, db_path):
        """Store *db_path* and create the ``translations`` table if missing."""
        self.db_path = db_path
        self._init_db()

    def _run(self, sql: str, params: tuple = (), fetch_one: bool = False):
        """Execute one statement on a fresh connection and close it afterwards.

        ``with conn`` only commits (or rolls back on error); it does not close
        the connection, so the close happens in ``finally``. Without it every
        cache lookup would leave an open handle on the database file.

        Returns:
            The first result row when *fetch_one* is true, otherwise ``None``.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                cursor = conn.execute(sql, params)
                return cursor.fetchone() if fetch_one else None
        finally:
            conn.close()

    def _init_db(self):
        """Create the ``translations`` table when it does not exist yet."""
        self._run('''
            CREATE TABLE IF NOT EXISTS translations (
                hash TEXT PRIMARY KEY,
                original TEXT,
                translated TEXT,
                source_lang TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

    def _hash(self, text: str) -> str:
        """Return the cache key for *text* (truncated SHA-256 hex digest)."""
        return hashlib.sha256(text.encode('utf-8')).hexdigest()[:32]

    def get(self, text: str) -> Optional[str]:
        """Return the cached translation of *text*, or ``None`` if absent."""
        row = self._run(
            'SELECT translated FROM translations WHERE hash = ?',
            (self._hash(text),),
            fetch_one=True,
        )
        return row[0] if row else None

    def set(self, original: str, translated: str, lang: str):
        """Insert or overwrite the translation stored for *original*.

        Args:
            original: Source text; its hash is the primary key.
            translated: Translation to store.
            lang: Source-language code saved alongside the row.
        """
        self._run('''
            INSERT OR REPLACE INTO translations (hash, original, translated, source_lang)
            VALUES (?, ?, ?, ?)
        ''', (self._hash(original), original, translated, lang))

# Module-level cache instance; translate_chunk reads and writes through it.
cache = TranslationCache(CACHE_DB)
print("‚úÖ Cache inicializado")

In [None]:
# ============================================
# 6. TRANSLATION FUNCTIONS
# ============================================

# Language code -> language name in Portuguese. translate_chunk interpolates
# the name into the prompt and falls back to the raw code for unknown keys.
LANG_NAMES = {
    'en': 'ingl√™s',
    'es': 'espanhol', 
    'ru': 'russo',
    'fr': 'franc√™s',
    'de': 'alem√£o',
    'it': 'italiano'
}

def detect_language(text: str) -> str:
    """Return the language code detected for *text*, or ``'unknown'``.

    Only the first 5000 characters are passed to ``langdetect.detect``.
    Blank input and any detection error yield ``'unknown'``.
    """
    try:
        sample = text[:5000]
        if not sample.strip():
            # Nothing to classify; skip the detector call entirely.
            return 'unknown'
        return detect(sample)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit can still stop a long batch run.
        return 'unknown'

def _split_long_paragraph(paragraph: str, max_chars: int) -> List[str]:
    """Cut *paragraph* into pieces of at most *max_chars* characters."""
    pieces = []
    rest = paragraph
    while len(rest) > max_chars:
        window = rest[:max_chars]
        cut = max_chars  # hard cut when no usable boundary is found
        # Prefer a line break, then a sentence end, then any space. A
        # boundary in the first half of the window is ignored so pieces do
        # not become needlessly small.
        for sep in ('\n', '. ', '! ', '? ', ' '):
            pos = window.rfind(sep)
            if pos >= max_chars // 2:
                cut = pos + len(sep)
                break
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def create_chunks(text: str, max_chars: Optional[int] = None) -> List[str]:
    """Split *text* into chunks for translation.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A paragraph longer than *max_chars* is split further at a line break,
    sentence end or space, so text without blank lines no longer ends up as
    a single oversized chunk. Paragraphs that fit are packed exactly as
    before, so chunks produced for ordinary text are unchanged.

    Args:
        text: Full text to split.
        max_chars: Target chunk size in characters. ``None`` means the
            module-level ``CHUNK_SIZE``, looked up at call time.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars is None:
        max_chars = CHUNK_SIZE
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    chunks = []
    current = ""

    for para in text.split('\n\n'):
        if len(para) > max_chars:
            # Flush what has been collected, then emit the oversized
            # paragraph as several bounded pieces.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_paragraph(para, max_chars))
            continue

        if len(current) + len(para) < max_chars:
            current += ('\n\n' if current else '') + para
        else:
            if current:
                chunks.append(current)
            current = para

    if current:
        chunks.append(current)

    return chunks

def translate_chunk(chunk: str, source_lang: str, retries: int = 3) -> str:
    """Translate one chunk to Brazilian Portuguese with Gemini.

    The cache is consulted first. On a miss the model is called up to
    *retries* times, with exponential backoff between attempts; a successful
    translation is cached before it is returned.

    Args:
        chunk: Text to translate.
        source_lang: Language code of *chunk*; mapped through ``LANG_NAMES``
            for the prompt, falling back to the code itself.
        retries: Maximum number of model calls.

    Returns:
        The translated text. Blank chunks are returned unchanged without
        calling the model, and the original chunk is returned as a fallback
        when every attempt fails.
    """
    # A blank chunk has nothing to translate; sending it would only spend a
    # request and risk caching whatever text the model makes up.
    if not chunk.strip():
        return chunk

    # Check the cache (truthiness test kept, so an empty cached row is
    # treated as a miss and translated again)
    cached = cache.get(chunk)
    if cached:
        return cached

    lang_name = LANG_NAMES.get(source_lang, source_lang)

    prompt = f"""Traduza o texto abaixo de {lang_name} para portugu√™s brasileiro.

REGRAS IMPORTANTES:
- Tradu√ß√£o fiel e liter√°ria ao original
- Mantenha a estrutura de par√°grafos
- Preserve nomes pr√≥prios
- N√ÉO adicione coment√°rios ou explica√ß√µes
- Retorne APENAS a tradu√ß√£o, nada mais

TEXTO ORIGINAL:
\"\"\"\n{chunk}\n\"\"\"

TRADU√á√ÉO EM PORTUGU√äS BRASILEIRO:"""

    last_error = None
    for attempt in range(retries):
        try:
            response = model.generate_content(
                prompt,
                generation_config=generation_config
            )

            translated = (response.text or "").strip()
            if translated:
                cache.set(chunk, translated, source_lang)
                time.sleep(RATE_LIMIT_DELAY)  # Rate limiting
                return translated

            # An empty answer is a failed attempt too: record it so it is
            # reported, and back off before asking again.
            last_error = "resposta vazia do modelo"

        except Exception as e:
            last_error = e

        if attempt < retries - 1:
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...

    if last_error is not None:
        print(f"‚ö†Ô∏è Erro tradu√ß√£o: {last_error}")

    return chunk  # Fallback: return the original text

# Confirmation message shown once the definitions in this cell have run.
print("‚úÖ Fun√ß√µes de tradu√ß√£o definidas")

In [None]:
# ============================================
# 7. FIND BOOKS TO TRANSLATE
# ============================================

def find_books_to_translate() -> List[Dict]:
    """Collect the source books that still lack a translation.

    Walks ``TXT_DIR`` recursively for ``*.txt`` files, skips every file whose
    ``<stem>_pt.txt`` counterpart already exists under ``TRANSLATED_DIR``,
    and keeps the files whose detected language is in ``TARGET_LANGUAGES``.
    Detection uses the first 5000 characters of each file.

    Returns:
        A list of dicts with the keys ``path``, ``output``, ``lang``,
        ``size`` (bytes) and ``name``, ordered by ascending size. The list
        is empty when ``TXT_DIR`` does not exist.
    """
    if not TXT_DIR.exists():
        print(f"‚ùå Pasta n√£o existe: {TXT_DIR}")
        return []

    pending = []
    for txt_file in TXT_DIR.rglob("*.txt"):
        # Mirror the source sub-directory layout under TRANSLATED_DIR.
        subdir = txt_file.relative_to(TXT_DIR).parent
        destination = TRANSLATED_DIR / subdir / f"{txt_file.stem}_pt.txt"

        # An existing output file means the book was handled before.
        if destination.exists():
            continue

        try:
            with open(txt_file, 'r', encoding='utf-8', errors='ignore') as handle:
                language = detect_language(handle.read(5000))

            if language not in TARGET_LANGUAGES:
                continue

            pending.append({
                'path': txt_file,
                'output': destination,
                'lang': language,
                'size': txt_file.stat().st_size,
                'name': txt_file.stem
            })
        except Exception as e:
            print(f"‚ö†Ô∏è Erro lendo {txt_file.name}: {e}")

    # Smallest books first, so early progress is quick.
    return sorted(pending, key=lambda entry: entry['size'])

# Find the books; the result is consumed by the translation loop in section 8
books_to_translate = find_books_to_translate()

print(f"\nüìö LIVROS PARA TRADUZIR: {len(books_to_translate)}")
print(f"\nPor idioma:")
# Count of pending books for each target language
for lang in TARGET_LANGUAGES:
    count = len([b for b in books_to_translate if b['lang'] == lang])
    print(f"  {LANG_NAMES.get(lang, lang)}: {count}")

# Show the first 10 (the list is sorted by size, so these are the smallest)
print(f"\nPrimeiros 10:")
for book in books_to_translate[:10]:
    size_kb = book['size'] / 1024
    print(f"  [{book['lang']}] {book['name'][:50]} ({size_kb:.1f} KB)")

In [None]:
# ============================================
# 8. MASS TRANSLATION
# ============================================

def translate_book(book: Dict) -> bool:
    """Translate one whole book and write it to ``book['output']``.

    The text is read from ``book['path']``, split with ``create_chunks``,
    translated chunk by chunk with ``translate_chunk`` and joined with blank
    lines. The result is written to a temporary sibling file and renamed into
    place, so an interrupted write cannot leave a truncated output file that
    a later run would treat as an already translated book.

    Args:
        book: Dict with the keys ``path``, ``output``, ``lang`` and ``name``.

    Returns:
        ``True`` when the output file was written, ``False`` when any step
        raised (the error is printed).
    """
    try:
        # Read the source text
        with open(book['path'], 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        # Split into chunks and translate each one
        chunks = create_chunks(text)
        translated_chunks = [
            translate_chunk(chunk, book['lang']) for chunk in chunks
        ]

        # translate_chunk hands back the original text when it gives up, so
        # an unchanged non-blank chunk probably means a failed translation.
        # Report it instead of letting it pass silently.
        unchanged = sum(
            1
            for source, result in zip(chunks, translated_chunks)
            if source.strip() and result == source
        )
        if unchanged:
            print(
                f"\nAVISO: {book['name']}: {unchanged} de {len(chunks)} "
                f"chunks ficaram iguais ao texto original"
            )

        # Join
        final_text = '\n\n'.join(translated_chunks)

        # Save: write to a temp file first, then rename over the target
        output = book['output']
        output.parent.mkdir(parents=True, exist_ok=True)
        temp_path = output.with_name(output.name + '.tmp')
        try:
            with open(temp_path, 'w', encoding='utf-8') as f:
                f.write(final_text)
            temp_path.replace(output)
        finally:
            # Only present here if the write or rename did not complete.
            if temp_path.exists():
                temp_path.unlink()

        return True

    except Exception as e:
        print(f"\n‚ùå Erro em {book['name']}: {e}")
        return False

# Run the translation over every pending book, one at a time
print("\n" + "="*60)
print("üöÄ INICIANDO TRADU√á√ÉO MASSIVA")
print("="*60)

# Counters for the summary below; `success` is also used in the commit
# message of section 9
success = 0
fail = 0
start_time = time.time()

with tqdm(total=len(books_to_translate), desc="Traduzindo", unit="livro") as pbar:
    for book in books_to_translate:
        # Show the (truncated) name of the book in progress next to the bar
        pbar.set_postfix_str(f"{book['name'][:30]}...")
        
        if translate_book(book):
            success += 1
        else:
            fail += 1
        
        pbar.update(1)

elapsed = time.time() - start_time

# Summary: elapsed time in minutes, counts and books per minute
print(f"\n" + "="*60)
print(f"‚úÖ TRADU√á√ÉO CONCLU√çDA!")
print(f"="*60)
print(f"Tempo total: {elapsed/60:.1f} minutos")
print(f"Sucesso: {success}")
print(f"Falhas: {fail}")
print(f"Velocidade: {success/(elapsed/60):.1f} livros/minuto")

In [None]:
# ============================================
# 9. COMMIT E PUSH PARA GITHUB
# ============================================

# Token do GitHub (configure nas secrets do Colab)
from google.colab import userdata

try:
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
except:
    GITHUB_TOKEN = input("Digite seu GitHub Token: ")

# Configura remote com token
!git remote set-url origin https://{GITHUB_TOKEN}@github.com/JoseRFJuniorLLMs/Googolplex-Books.git

# Add e commit
!git add translated/ data/translation_cache.db
!git status --short | head -20

commit_msg = f"feat: Adiciona {success} livros traduzidos via Vertex AI"
!git commit -m "{commit_msg}"

# Push
!git push origin main

print("\n‚úÖ Push para GitHub conclu√≠do!")

In [None]:
# ============================================
# 10. FINAL STATISTICS
# ============================================

# Every .txt file under the output directory, including ones from earlier runs
translated_files = list(TRANSLATED_DIR.rglob("*.txt"))

print("\n" + "="*60)
print("üìä ESTAT√çSTICAS FINAIS")
print("="*60)
print(f"Total de arquivos traduzidos: {len(translated_files)}")

# Total size on disk, reported in MB
total_size = sum(f.stat().st_size for f in translated_files)
print(f"Tamanho total: {total_size / (1024*1024):.1f} MB")

# Cache stats: number of rows in the translations table
with sqlite3.connect(CACHE_DB) as conn:
    cur = conn.execute("SELECT COUNT(*) FROM translations")
    cache_count = cur.fetchone()[0]
print(f"Chunks em cache: {cache_count}")

print("\nüéâ Processo conclu√≠do!")
print(f"üìÅ Tradu√ß√µes em: {TRANSLATED_DIR}")