<a href="https://colab.research.google.com/github/Mv2077/VoxLibris/blob/main/VoxLibris(3_0).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# =========================================================
#     C O G N I V E R S E  -  M L3
# PROJECT: Robô Leitor com GRPO + Unsupervised + RF + 2 Agentes
# =========================================================

In [None]:
%%capture

# Install the notebook's dependencies; %%capture (kept as the first line of the
# cell) silences pip's output.
# numpy is force-reinstalled at the latest version and pyarrow is pinned to 14.0.2.
!pip install numpy --upgrade --force-reinstall
!pip install --no-cache-dir --force-reinstall pyarrow==14.0.2
# Model / training stack.
!pip install unsloth
!pip install -U sentence-transformers scikit-learn pypdf2 PyCryptodome accelerate einops
!pip install vllm
!pip install --upgrade transformers bitsandbytes
# Experiment tracking.
!pip install wandb

In [None]:
#@title 1. Imports
%%capture

from unsloth import FastLanguageModel
from trl import GRPOTrainer, GRPOConfig
import json
import numpy as np
import requests
import torch
import PyPDF2
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, silhouette_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import wandb
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix, f1_score

In [None]:
#@title Requests

# --- Download the PDF ---
PDF_URL = 'https://cdn.shopify.com/s/files/1/2081/8163/files/022-I-FOUND-A-FROG-Free-Childrens-Book-By-Monkey-Pen.pdf?v=1589890638'
PDF_FILENAME = 'Main.pdf'

print(f"A descarregar PDF de: {PDF_URL}...")
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(PDF_URL, timeout=60)
# Fail fast on HTTP errors so that an error page is never written to disk as
# if it were the PDF.
r.raise_for_status()
with open(PDF_FILENAME, 'wb') as f:
    f.write(r.content)
print("Download concluído!")

def extract_pages(pdf_path):
    """Collect the text of every page of a PDF that has meaningful content.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped text of each page, in document order, keeping only pages
        whose stripped text is longer than 10 characters. An empty list is
        returned when ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        return []
    # Text is extracted while the file handle is still open.
    with open(pdf_path, "rb") as pdf_file:
        raw_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    # Skip pages with no text at all, then drop the near-empty ones.
    stripped_texts = (raw.strip() for raw in raw_texts if raw)
    return [content for content in stripped_texts if len(content) > 10]

# Extract the page texts from the downloaded PDF and report how many were kept.
pages = extract_pages(PDF_FILENAME)
print(f"Total de páginas extraídas: {len(pages)}")


### Business and Data Understanding


---


**O nosso negócio seria desenvolver um robô que fizesse a leitura de livros.**
O robô precisaria de uma base de dados onde introduzíssemos os livros; depois, bastaria pedir a leitura de um livro específico e o robô faria essa leitura.


In [None]:
#@title Data engineering

# ============================================================
# 1 — DATA ENGINEERING
# Extraction → Embeddings → Clustering → RF → Dataset
# ============================================================


# Path of the PDF processed by this cell (the same file name the download cell writes).
PDF_FILENAME = "Main.pdf" # Changed from "livro.pdf" to "Main.pdf"

# -------------------------------
# 1. Extração do PDF
# -------------------------------
def extract_pages(pdf_path):
    """Collect the text of every page of a PDF that has meaningful content.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped text of each page, in document order, keeping only pages
        whose stripped text is longer than 10 characters. An empty list is
        returned when ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        return []
    # Text is extracted while the file handle is still open.
    with open(pdf_path, "rb") as pdf_file:
        raw_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    # Skip pages with no text at all, then drop the near-empty ones.
    stripped_texts = (raw.strip() for raw in raw_texts if raw)
    return [content for content in stripped_texts if len(content) > 10]

pages = extract_pages(PDF_FILENAME)
print(f"Total de páginas extraídas: {len(pages)}")

# Clustering into N_CLUSTERS groups and scoring the result with the silhouette
# coefficient both need more samples than clusters. Fail early with an
# actionable message instead of an obscure error inside scikit-learn (for
# example when the PDF is missing or yields almost no extractable text).
N_CLUSTERS = 3
if len(pages) <= N_CLUSTERS:
    raise ValueError(
        f"Foram extraídas apenas {len(pages)} páginas de '{PDF_FILENAME}'; "
        f"são necessárias mais de {N_CLUSTERS} para o clustering."
    )


# -------------------------------
# 2. Embeddings
# -------------------------------
# One sentence-transformer embedding per extracted page.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(pages, show_progress_bar=True)


# -------------------------------
# 3. Clustering (topic discovery)
# -------------------------------
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
cluster_ids = kmeans.fit_predict(embeddings)
print("Clustering concluído!")


# -------------------------------
# 4. Clustering evaluation
# -------------------------------
sil_score = silhouette_score(embeddings, cluster_ids)
print(f"Silhouette Score: {sil_score:.4f}")


# -------------------------------
# 5. Random Forest as the "judge"
# -------------------------------
# TF-IDF features of each page are used to predict that page's K-Means cluster id.
vectorizer = TfidfVectorizer(max_features=1500)
X = vectorizer.fit_transform(pages)
y = cluster_ids

# A stratified split raises ValueError when it cannot place every class on both
# sides (e.g. a cluster with a single page, or a test set smaller than the
# number of clusters), which can happen with a short book. Fall back to a plain
# split in that case instead of aborting the notebook.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
except ValueError as split_error:
    print(f"Aviso: split estratificado impossível ({split_error}); a usar split não estratificado.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

print("\n=== Avaliação do RF (Juiz) ===")
# Predict once and reuse the result for both reports.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


# Train the final model on all pages
rf_final = RandomForestClassifier(n_estimators=200, random_state=42)
rf_final.fit(X, y)


# -------------------------------
# 6. Build the final dataset
# -------------------------------
# One record per extracted page, tagged with the cluster id assigned to it.
dataset = [
    {
        "id": f"page_{page_idx}",
        "text": page_text,
        "cluster_id": int(cluster_ids[page_idx]),
        "meta": {"page": page_idx, "book": "Livro_Exemplo"},
    }
    for page_idx, page_text in enumerate(pages)
]

# Persist the records as pretty-printed JSON, keeping non-ASCII characters as-is.
with open("dataset_prepared.json", "w", encoding="utf8") as out_file:
    json.dump(dataset, out_file, ensure_ascii=False, indent=2)

print("Dataset criado: dataset_prepared.json")

In [None]:
#@title Wandb Starter
from trl import GRPOTrainer, GRPOConfig

# Build the GRPO configuration first so that the values logged to W&B are read
# back from the config object itself.
config = GRPOConfig(
    per_device_train_batch_size=1,
    num_generations=4,
    max_completion_length=200,
    learning_rate=1e-5,
)

# Hyperparameters recorded with the run.
wandb_run_config = {
    "model": "Llama-3.2-3B-Instruct",
    "batch_size": config.per_device_train_batch_size,
    "lr": config.learning_rate,
    "num_generations": config.num_generations,
    "max_completion_length": config.max_completion_length,
}

wandb.init(
    project="grpo-lora-reader",
    name="treino_lora_grpo",
    config=wandb_run_config,
)

In [None]:
#@title Model Engineering
# ============================================================
# ReaderAgent (Llama + LoRA + GRPO)
# Summarizer
# Q&A
# ============================================================


# ============================================================
# 2 — MODELING
# ReaderAgent (LLaMA) + AnalystAgent + Reward Function
# ============================================================

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from unsloth import FastLanguageModel
from sentence_transformers import util

# -------------------------------
# Perplexity
# -------------------------------
# GPT-2 tokenizer and model used by calculate_perplexity below; the model is
# switched to eval mode right after loading.
tok_gpt2 = AutoTokenizer.from_pretrained("gpt2")
model_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def calculate_perplexity(text):
    """Return the GPT-2 perplexity of ``text``.

    The text is tokenised with ``tok_gpt2`` (truncated to 512 tokens) and scored
    with ``model_gpt2`` using the input ids as labels; the perplexity is the
    exponential of the returned loss.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a float. ``float("inf")`` is returned when the text
        cannot be scored: fewer than two tokens (there is no next-token target
        to predict) or a non-finite loss.
    """
    inputs = tok_gpt2(text, return_tensors="pt", truncation=True, max_length=512)
    # A next-token loss needs at least one (context, target) pair.
    if inputs["input_ids"].shape[-1] < 2:
        return float("inf")
    with torch.no_grad():
        loss = model_gpt2(**inputs, labels=inputs["input_ids"]).loss
    # Never hand a NaN to callers: it would silently poison downstream scores.
    if not torch.isfinite(loss):
        return float("inf")
    return float(torch.exp(loss).item())


# -------------------------------
# AnalystAgent (Reward)
# -------------------------------
class AnalystAgent:
    """Scores a generated text against its source text to produce a reward.

    The reward combines three components:
      * cosine similarity between the embeddings of both texts (weight 0.4),
      * a fluency score derived from the perplexity of the generated text
        (weight 0.2),
      * whether the classifier assigns both texts to the same topic (weight 0.4).

    Every evaluation is appended to ``self.history``.
    """

    def __init__(self, vectorizer, rf, embedder, log_to_wandb=True):
        """Store the collaborators used for scoring.

        Args:
            vectorizer: Object whose ``transform`` turns a list of texts into
                features accepted by ``rf``.
            rf: Classifier whose ``predict`` returns one topic per input row.
            embedder: Object whose ``encode`` returns a text embedding.
            log_to_wandb: Stored on the instance; no method of this class reads it.
        """
        self.vec = vectorizer
        self.rf = rf
        self.embedder = embedder
        self.history = []
        self.log_to_wandb = log_to_wandb

    def evaluate(self, original, generated):
        """Return a reward in [0, 1] for ``generated`` given ``original``.

        Args:
            original: The source text.
            generated: The text produced by the model.

        Returns:
            The clamped reward as a float. A blank ``generated`` text scores 0.0.
        """
        # A blank completion carries nothing to score: record it and give no reward.
        if not str(generated).strip():
            self.history.append({
                "similarity": 0.0,
                "perplexity": 0.0,
                "topic_match": 0.0,
                "reward": 0.0
            })
            return 0.0

        # Semantic similarity
        e1 = self.embedder.encode(original, convert_to_tensor=True)
        e2 = self.embedder.encode(generated, convert_to_tensor=True)
        sim = float(util.pytorch_cos_sim(e1, e2).item())

        # Perplexity score: decreases as perplexity grows.
        ppl = calculate_perplexity(generated)
        ppl_score = float(1 / (1 + 0.1 * np.log(ppl + 1)))
        # A NaN here would survive the clamp below as 1.0 (comparisons with NaN
        # are False, so min(1, nan) yields 1), rewarding unscorable text with
        # the maximum reward. Treat a non-finite score as the worst score.
        if not np.isfinite(ppl_score):
            ppl_score = 0.0

        # Topic agreement (RF)
        orig_topic = self.rf.predict(self.vec.transform([original]))[0]
        gen_topic = self.rf.predict(self.vec.transform([generated]))[0]
        topic_score = 1.0 if orig_topic == gen_topic else 0.0

        # Final reward, clamped to [0, 1]
        reward = 0.4 * sim + 0.2 * ppl_score + 0.4 * topic_score
        reward = float(max(0, min(1, reward)))

        # Note: the "perplexity" key holds the derived score, not the raw perplexity.
        self.history.append({
            "similarity": sim,
            "perplexity": ppl_score,
            "topic_match": topic_score,
            "reward": reward
        })

        return reward


# Reward scorer built from the TF-IDF vectorizer, the Random Forest fitted on
# all pages (rf_final) and the sentence embedder.
analyst = AnalystAgent(vectorizer, rf_final, embedder)


# -------------------------------
# Prompts + map
# -------------------------------
def build_prompt(ex):
    """Turn one dataset record into a training example.

    Args:
        ex: Mapping with at least the keys ``"text"`` and ``"cluster_id"``.

    Returns:
        A dict with the full ``"prompt"`` string (instructions, a blank line,
        then the cluster tag and the text), the untouched ``"original_text"``
        and the ``"cluster_id"``.
    """
    page_text = ex["text"]
    cluster = ex["cluster_id"]

    instructions = "".join([
        "Tu és um leitor automático.\n",
        "Gera uma leitura natural do texto.\n",
    ])
    request = "CLUSTER={}\nTexto:\n{}\n".format(cluster, page_text)

    return dict(
        prompt=f"{instructions}\n{request}",
        original_text=page_text,
        cluster_id=cluster,
    )

# One training example per dataset record.
training_data = [build_prompt(ex) for ex in dataset]


# Map prompt → original_text (identical prompts collapse into a single entry)
prompt_str_map = {item["prompt"]: item["original_text"] for item in training_data}


# -------------------------------
# Função de Reward GRPO
# -------------------------------
def _completion_text(completion):
    """Return the plain text of a completion given as a string or as chat messages."""
    if isinstance(completion, str):
        return completion
    # Conversational format: a list of {"role": ..., "content": ...} messages.
    if isinstance(completion, (list, tuple)) and completion and all(
        isinstance(message, dict) for message in completion
    ):
        return "".join(str(message.get("content", "")) for message in completion)
    return str(completion)


def grpo_reward(prompts, completions, **kwargs):
    """Reward function for GRPO: one score per (prompt, completion) pair.

    Args:
        prompts: The prompts of the batch.
        completions: The generated completions, aligned with ``prompts``.
        **kwargs: Extra per-example lists supplied by the trainer. When an
            ``original_text`` list is present (the trainer forwards the other
            dataset columns this way) it is used as the source text; otherwise
            the source text is looked up in ``prompt_str_map`` by prompt
            string, defaulting to ``""`` when the prompt is unknown.

    Returns:
        A list of float rewards computed by ``analyst.evaluate``.
    """
    originals = kwargs.get("original_text")
    rewards = []
    for idx, (p, c) in enumerate(zip(prompts, completions)):
        if originals is not None and idx < len(originals) and originals[idx] is not None:
            # Prefer the column value: it does not depend on the prompt string
            # matching the map key exactly.
            original = originals[idx]
        else:
            prompt_text = p if isinstance(p, str) else str(p)
            original = prompt_str_map.get(prompt_text, "")
        generated = _completion_text(c)
        rewards.append(analyst.evaluate(original, generated))
    return rewards


In [None]:
# Preview the first 500 characters of the first training prompt.
print(training_data[0]["prompt"][:500])

In [None]:
#@title Training
# ============================================================
# 3 — TRAINING (GRPO)
# ============================================================

from trl import GRPOTrainer, GRPOConfig

# Load the Llama 3.2 3B Instruct checkpoint through Unsloth with 4-bit loading
# enabled and a maximum sequence length of 2048.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto"
)

# Wrap the model for parameter-efficient fine-tuning with rank r=8.
model = FastLanguageModel.get_peft_model(model, r=8)

# GRPOTrainer's documented keyword for the tokenizer is `processing_class`;
# using it avoids depending on a compatibility shim for `tokenizer=`.
# NOTE(review): train_dataset is a plain list of dicts here — confirm the
# installed TRL version accepts it, or convert it to a `datasets.Dataset`.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[grpo_reward],
    args=config,
    train_dataset=training_data
)

print("Iniciando treino...")
trainer.train()
print("Treino concluído!")


# Save the trained model and tokenizer.
model.save_pretrained("modelo_lora_grpo")
tokenizer.save_pretrained("modelo_lora_grpo")

In [None]:
#@title Quality Assurance

# ============================================================
# 4 — QUALITY
# ============================================================

import pandas as pd
import matplotlib.pyplot as plt

print("\n=== RELATÓRIO DE QUALIDADE ===\n")

if analyst.history:
    # One row per reward evaluation recorded by the analyst.
    df = pd.DataFrame(analyst.history)

    print(df.describe())

    # Plot every recorded metric on a single figure, in evaluation order.
    plt.figure(figsize=(12, 6))
    for metric in ("reward", "similarity", "perplexity", "topic_match"):
        plt.plot(df[metric], label=metric)
    plt.legend()
    plt.title("Evolução das Métricas")
    plt.grid()
    plt.show()

    print("\nÚltimas 10 avaliações:")
    print(df.tail(10))
else:
    print("❗ Histórico vazio — reward function não foi chamada.")


In [None]:
#@title DEPLOYMENT

print("A guardar o modelo aprovado para produção...")

# Save the final post-GRPO model
model.save_pretrained("modelo_final_aprovado")
tokenizer.save_pretrained("modelo_final_aprovado")

print("Modelo salvo em '/content/modelo_final_aprovado'")

# ------------------------------------------------------------
# 1. Load the final model (as real inference would)
# ------------------------------------------------------------
print("\nA carregar o modelo para inferência...")

modelo_prod, tokenizer_prod = FastLanguageModel.from_pretrained(
    model_name="modelo_final_aprovado",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto"
)

# Switch the model to Unsloth's inference mode. The call is made for its effect
# on the model object, as in Unsloth's documented usage, so the notebook does
# not depend on what the method returns.
FastLanguageModel.for_inference(modelo_prod)


# ------------------------------------------------------------
# 2. Função de Inferência (API final do projeto)
# ------------------------------------------------------------
def assistente_leitura(texto_usuario):
    """Generate the model's answer for a piece of user text.

    Args:
        texto_usuario: The text supplied by the user.

    Returns:
        Only the newly generated text (the prompt is not included), stripped of
        surrounding whitespace.
    """
    # Chat-format prompt.
    # NOTE(review): this is not the prompt used for training — build_prompt
    # produces a plain-text prompt with a different instruction and a CLUSTER
    # tag. Confirm whether the two should be aligned.
    messages = [
        {"role": "system", "content": "Tu és um assistente útil que resume e explica textos."},
        {"role": "user", "content": f"Texto: {texto_usuario}"}
    ]

    # Render the chat template to token ids on the model's device
    input_ids = tokenizer_prod.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    ).to(modelo_prod.device)

    # Generation
    with torch.no_grad():
        output = modelo_prod.generate(
            input_ids,
            max_new_tokens=250,
            temperature=0.7,
            do_sample=True,
            repetition_penalty=1.1
        )

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them; removing the system string from the decoded text would
    # still leave the role headers and the user's text in the answer.
    new_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer_prod.decode(new_tokens, skip_special_tokens=True).strip()


# ------------------------------------------------------------
# 3. Demonstração real do sistema
# ------------------------------------------------------------
print("\n--- SIMULAÇÃO DE DEPLOYMENT ---")

# Sample input used to exercise the inference function end to end.
texto_teste = (
    "The little frog looked at the sky and wondered why it was so blue. "
    "He jumped happily into the pond."
)

print(f"Input do utilizador:\n{texto_teste}")
print("-" * 60)

# Run the assistant on the sample text and show its answer.
resultado = assistente_leitura(texto_teste)
print(f"Resposta do modelo:\n{resultado}")


In [None]:
#@title Monitoring and Maintenance
# Placeholder for the monitoring stage: this cell only prints a reminder message.
print("Monitorizar resultados e coletar feedback para manter e melhorar o desempenho do robô assistente.")