In [None]:
# Célula 0: Instalação
!pip install --upgrade datasets transformers accelerate bitsandbytes gensim

# Main cell: evaluation pipeline with vocabulary injection
import pandas as pd
import torch
import time
import re
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from gensim.models.phrases import Phrases, Phraser

# --- 1. CONFIGURACOES ---
# Upper bound on the number of rows sampled per language for measurement.
NUM_SAMPLES = 500
# Texts with more whitespace-separated words than this are skipped.
MAX_TEXT_LENGTH_WORDS = 4000
# min_count / threshold values handed to gensim's Phrases.
PMI_CONFIG = dict(min_count=5, threshold=10.0)

# Datasets to process, keyed by language code (iterated in this order).
DATASETS_CONFIG = {
    "pt": dict(
        name="TucanoBR/GigaVerbo-Text-Filter",
        config_name="default",
        text_column="text",
    ),
    "en": dict(name="ag_news", text_column="text"),
}

# Load the static tokenizers up front so they can be modified later on.
STATIC_TOKENIZERS_NAMES = ["xlm-roberta-base", "google/mt5-base"]
static_tokenizers_map = dict(
    zip(
        STATIC_TOKENIZERS_NAMES,
        map(AutoTokenizer.from_pretrained, STATIC_TOKENIZERS_NAMES),
    )
)

DYNAMIC_MODEL_NAME = "google/gemma-2b-it"

# --- 2. PRE-PROCESSADORES ---
class BasePreprocessor:
    """Common interface for the text preprocessing strategies.

    Subclasses override ``apply``. ``str()`` of an instance yields the name
    of its concrete class.
    """

    def apply(self, text: str) -> str:
        """Transform ``text``; the base implementation always raises.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def __str__(self):
        return type(self).__name__

class BaselinePreprocessor(BasePreprocessor):
    """Strategy that leaves the text untouched."""

    def apply(self, text: str) -> str:
        """Return ``text`` exactly as it was received."""
        unchanged = text
        return unchanged

class RegexPreprocessor(BasePreprocessor):
    """Lowercase the text and set punctuation marks apart with spaces."""

    def apply(self, text: str) -> str:
        """Return the normalized form of ``text``.

        The text is lowercased, each of ``. , ! ? ;`` is surrounded by
        spaces, every run of whitespace is collapsed to one space, and the
        result is stripped at both ends.
        """
        spaced = re.sub(r'([.,!?;])', r' \1 ', text.lower())
        return re.sub(r'\s+', ' ', spaced).strip()

class LearnedMWEPreprocessor(BasePreprocessor):
    """Rewrite text by passing its tokens through a phrase model."""

    def __init__(self, phraser_model):
        # Object that is indexed with a list of tokens and yields the
        # transformed tokens.
        self.phraser = phraser_model

    def apply(self, text: str) -> str:
        """Return ``text`` after the phrase model has been applied.

        The text is lowercased and split on whitespace, the token list is
        looked up in the phraser, and the resulting tokens are joined with
        single spaces.
        """
        lowered_words = text.lower().split()
        return " ".join(self.phraser[lowered_words])

# --- 3. MODELO DINÂMICO ---
print("Carregando modelo dinâmico...")
dynamic_tokenizer = AutoTokenizer.from_pretrained(DYNAMIC_MODEL_NAME)
# Device placement and weight dtype are both left to the library ("auto").
dynamic_model = AutoModelForCausalLM.from_pretrained(
    DYNAMIC_MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
)

# --- 4. FUNÇÕES ---
def train_and_inject_vocab(dataset, text_column, lang_code, tokenizers_list, dyn_model,
                           dyn_tokenizer=None):
    """Learn bigram phrases from ``dataset`` and add them to every tokenizer.

    Steps:
      1. Train a gensim ``Phrases`` model on the lowercased,
         whitespace-split texts.
      2. Export the detected multi-word expressions (MWEs).
      3. Add them as new tokens to each static tokenizer and to the dynamic
         tokenizer; when the dynamic tokenizer grew, resize the model's
         token embeddings to the new vocabulary size.

    Args:
        dataset: Iterable of rows that can be indexed by ``text_column``.
        text_column: Key of the text field in each row.
        lang_code: Language label; only used in progress messages.
        tokenizers_list: Mapping of name -> tokenizer. Every tokenizer is
            modified in place through ``add_tokens``.
        dyn_model: Model whose token embeddings are resized.
        dyn_tokenizer: Tokenizer paired with ``dyn_model``. When omitted,
            the module-level ``dynamic_tokenizer`` is used, which is what
            this function always did before the parameter existed.

    Returns:
        The ``Phraser`` built from the trained ``Phrases`` model.
    """
    if dyn_tokenizer is None:
        dyn_tokenizer = dynamic_tokenizer

    print(f"--- Treinando PMI para {lang_code.upper()} ---")
    # Rows whose text is None or empty are skipped: None would crash on
    # .lower(), and an empty text contributes no tokens anyway.
    stream = (
        row[text_column].lower().split()
        for row in dataset
        if row[text_column]
    )
    phrases = Phrases(stream, min_count=PMI_CONFIG["min_count"], threshold=PMI_CONFIG["threshold"])
    bigram_phraser = Phraser(phrases)
    new_tokens = list(phrases.export_phrases().keys())

    print(f"MWEs detectados: {len(new_tokens)}. Ex: {new_tokens[:5]}")

    if new_tokens:
        print(f"Injetando {len(new_tokens)} novos tokens nos tokenizadores...")

        # 1. Extend the static tokenizers.
        for name, tok in tokenizers_list.items():
            num_added = tok.add_tokens(new_tokens)
            print(f"  -> {name}: {num_added} tokens adicionados.")

        # 2. Extend the dynamic tokenizer; the embedding matrix only needs
        #    resizing when the vocabulary actually grew.
        num_added_dyn = dyn_tokenizer.add_tokens(new_tokens)
        if num_added_dyn > 0:
            dyn_model.resize_token_embeddings(len(dyn_tokenizer))
            print("  -> Modelo Dinâmico redimensionado com sucesso!")

    return bigram_phraser

def get_static_metrics(text: str, tokenizer):
    """Count the tokens ``tokenizer`` produces for ``text`` and their ratio to words.

    Args:
        text: Text to measure.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            collection of tokens.

    Returns:
        ``(num_tokens, fertility)``, where fertility is the number of tokens
        divided by the number of whitespace-separated words, or ``0`` when
        the text contains no words.
    """
    token_count = len(tokenizer.tokenize(text))
    word_count = len(text.split())
    if word_count > 0:
        return token_count, token_count / word_count
    return token_count, 0

def get_dynamic_metrics(text: str, tokenizer, model):
    """Time a single short generation call for ``text``.

    The text is tokenized, the inputs are moved to the model's device, and
    ``model.generate`` is called with ``max_new_tokens=20``. Only the
    ``generate`` call is timed; tokenization is excluded.

    Args:
        text: Prompt to feed to the model.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and can be unpacked as keyword arguments.
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.

    Returns:
        Elapsed seconds as a float, or ``None`` if tokenization or
        generation raised. Failures are reported through a
        ``RuntimeWarning`` instead of being dropped silently.
    """
    # Local import: the file's import block does not bring in ``warnings``.
    import warnings

    try:
        # Use the device the model actually lives on rather than assuming
        # "cuda", so the measurement also works without a GPU.
        inputs = tokenizer(
            text, return_tensors="pt", return_attention_mask=False
        ).to(model.device)
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        model.generate(**inputs, max_new_tokens=20)
        return time.perf_counter() - start_time
    except Exception as exc:
        # Best-effort measurement: keep the run going, but make the failure
        # visible so a column full of None values can be diagnosed.
        warnings.warn(
            f"get_dynamic_metrics failed ({type(exc).__name__}: {exc})",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

# --- 5. EXECUÇÃO ---
all_results = []

# Maximum number of rows used to learn the MWEs for each language.
TRAIN_SUBSET_SIZE = 10000

for lang, config in DATASETS_CONFIG.items():
    print(f"\n================ LÍNGUA: {lang.upper()} ================")

    # Reload and shuffle the dataset for this language.
    dataset_name = config["name"]
    config_name = config.get("config_name")
    text_column = config["text_column"]
    full_dataset = load_dataset(dataset_name, name=config_name, split='train').shuffle(seed=42)
    total_rows = len(full_dataset)

    # Evaluation sample: the first rows of the shuffled dataset.
    test_size = min(NUM_SAMPLES, total_rows)
    test_subset = full_dataset.select(range(test_size))

    # Training sample: the rows right after the evaluation sample, so the
    # MWEs are not learned from the very texts they are measured on.
    train_end = min(test_size + TRAIN_SUBSET_SIZE, total_rows)
    if train_end > test_size:
        train_rows = range(test_size, train_end)
    else:
        # Nothing is left to hold out: fall back to the leading window,
        # which overlaps the evaluation sample.
        train_rows = range(min(TRAIN_SUBSET_SIZE, total_rows))
    train_subset = full_dataset.select(train_rows)

    # Train the phrase model AND inject the phrases into the vocabularies.
    mwe_model = train_and_inject_vocab(
        train_subset, text_column, lang,
        static_tokenizers_map, dynamic_model
    )

    preprocessors = [
        BaselinePreprocessor(),
        RegexPreprocessor(),
        LearnedMWEPreprocessor(phraser_model=mwe_model)  # vocabulary already prepared
    ]
    # Strategy labels do not depend on the sample, so compute them once.
    strategy_names = [
        "Automated_MWE_VocabInjection" if isinstance(p, LearnedMWEPreprocessor) else str(p)
        for p in preprocessors
    ]

    for sample in tqdm(test_subset, desc=f"Medindo {lang}"):
        text_content = sample[text_column]
        # Skip missing/empty texts and texts above the word limit.
        if not text_content or len(text_content.split()) > MAX_TEXT_LENGTH_WORDS:
            continue

        for strategy_name, preprocessor in zip(strategy_names, preprocessors):
            processed_text = preprocessor.apply(text_content)

            # Static metrics: one row per static tokenizer.
            for name, tokenizer in static_tokenizers_map.items():
                num_tokens, fertility = get_static_metrics(processed_text, tokenizer)
                all_results.append({
                    "language": lang,
                    "strategy": strategy_name,
                    "model_or_tokenizer": name,
                    "metric_type": "static",
                    "num_tokens": num_tokens,
                    "fertility": fertility,
                    "generation_time": None
                })

            # Dynamic metric: generation time of the causal LM.
            generation_time = get_dynamic_metrics(processed_text, dynamic_tokenizer, dynamic_model)
            all_results.append({
                "language": lang,
                "strategy": strategy_name,
                "model_or_tokenizer": DYNAMIC_MODEL_NAME,
                "metric_type": "dynamic",
                "num_tokens": None,
                "fertility": None,
                "generation_time": generation_time
            })

# Persist all collected rows.
df_results = pd.DataFrame(all_results)
df_results.to_csv("vocab_injection_results.csv", index=False)
print("\nSucesso! Vocabulário expandido e métricas coletadas.")