In [None]:
# Cell 0: Upgrade the libraries to their latest versions
!pip install --upgrade datasets transformers accelerate bitsandbytes

# Main cell: imports, settings and execution

import pandas as pd
import torch
import time
import re
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# --- 1. EXPERIMENT SETTINGS ---
NUM_SAMPLES = 500
MAX_TEXT_LENGTH_WORDS = 4000  # Maximum number of words allowed per text

# Per-language dataset settings: dataset name, optional configuration name
# ("config_name" may be absent) and the column that holds the text.
DATASETS_CONFIG = {
    "pt": {
        "name": "TucanoBR/GigaVerbo-Text-Filter",
        "config_name": "default",
        "text_column": "text",
    },
    "en": {
        "name": "ag_news",
        "text_column": "text",
    },
}

# Tokenizers measured statically, the model used for timed generation, and
# the Portuguese multi-word-expression dictionary.
STATIC_TOKENIZERS = [
    "xlm-roberta-base",
    "google/mt5-base",
]
DYNAMIC_MODEL = "google/gemma-2b-it"
MWE_DICT_PT = {
    "processamento de linguagem natural": "PLN",
}

# --- 2. PRE-PROCESSING CLASSES ---
class BasePreprocessor:
    """Common interface for text pre-processing strategies.

    Subclasses override ``apply``. Converting an instance to ``str`` yields
    the name of its class.
    """

    def __str__(self):
        return type(self).__name__

    def apply(self, text: str) -> str:
        """Return the pre-processed form of ``text``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

class BaselinePreprocessor(BasePreprocessor):
    """Identity strategy: hands the text back exactly as received."""

    def apply(self, text: str) -> str:
        """Return ``text`` unchanged."""
        return text

class RegexPreprocessor(BasePreprocessor):
    """Lowercases text and surrounds basic punctuation with spaces."""

    # Patterns are compiled once when the class is created.
    _PUNCTUATION = re.compile(r'([.,!?;])')
    _WHITESPACE_RUN = re.compile(r'\s+')

    def apply(self, text: str) -> str:
        """Return ``text`` lowercased, with ``. , ! ? ;`` space-separated.

        Every run of whitespace is collapsed to a single space and the
        result is stripped at both ends.
        """
        spaced = self._PUNCTUATION.sub(r' \1 ', text.lower())
        return self._WHITESPACE_RUN.sub(' ', spaced).strip()

class MWEPreprocessor(BasePreprocessor):
    """Joins known multi-word expressions (MWEs) with underscores.

    The text is lowercased, then every known expression is replaced by the
    same expression with its spaces turned into underscores. Only the keys
    of the dictionary are used; its values are ignored.
    """

    def __init__(self, mwe_dict: dict):
        """Prepare the expression list and its replacement map.

        Args:
            mwe_dict: mapping whose keys are the multi-word expressions.
        """
        # ``apply`` lowercases the text before matching, so keys are
        # lowercased too; otherwise a key containing an uppercase letter
        # could never match. dict.fromkeys drops duplicates produced by the
        # lowercasing while preserving the original order.
        unique_keys = dict.fromkeys(key.lower() for key in mwe_dict)
        # Longest first, so a long expression is replaced before any shorter
        # expression contained in it.
        self.mwe_dict = sorted(unique_keys, key=len, reverse=True)
        self.mwe_map = {mwe: mwe.replace(' ', '_') for mwe in self.mwe_dict}

    def apply(self, text: str) -> str:
        """Return ``text`` lowercased with each known MWE underscore-joined."""
        processed_text = text.lower()
        for mwe in self.mwe_dict:
            processed_text = processed_text.replace(mwe, self.mwe_map[mwe])
        return processed_text

# --- 3. LOADING THE DYNAMIC MODEL ONTO THE GPU ---
print("Carregando o modelo dinamico na GPU (pode demorar alguns minutos)...")

# Tokenizer and causal LM both come from the DYNAMIC_MODEL checkpoint;
# device placement and dtype are left to the library ("auto").
dynamic_tokenizer = AutoTokenizer.from_pretrained(DYNAMIC_MODEL)
dynamic_model = AutoModelForCausalLM.from_pretrained(DYNAMIC_MODEL, device_map="auto", torch_dtype="auto")

print("Modelo dinamico carregado com sucesso!")

# --- 4. METRIC COLLECTION FUNCTIONS ---
def get_static_metrics(text: str, tokenizer):
    """Return ``(num_tokens, fertility)`` for ``text`` under ``tokenizer``.

    ``num_tokens`` is the length of ``tokenizer.tokenize(text)``. Fertility
    is the number of tokens per whitespace-separated word, or ``0`` when the
    text contains no words.
    """
    token_count = len(tokenizer.tokenize(text))
    word_count = len(text.split())
    if word_count > 0:
        return token_count, token_count / word_count
    # No words: avoid dividing by zero.
    return token_count, 0

def get_dynamic_metrics(text: str, tokenizer, model):
    """Measure how long ``model`` takes to generate up to 20 new tokens.

    Args:
        text: prompt to tokenize and feed to the model.
        tokenizer: callable returning an object that has ``.to(device)`` and
            can be unpacked as keyword arguments for ``model.generate``.
        model: object exposing ``generate``; its ``device`` attribute, when
            present, decides where the inputs are placed.

    Returns:
        Elapsed seconds of the ``generate`` call as a float, or ``None`` when
        tokenization or generation raised an exception.
    """
    try:
        # Put the inputs on the device the model reports instead of assuming
        # CUDA; fall back to "cuda" when the model has no ``device``.
        device = getattr(model, "device", "cuda")
        inputs = tokenizer(text, return_tensors="pt", return_attention_mask=False).to(device)
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump with clock adjustments.
        start_time = time.perf_counter()
        model.generate(**inputs, max_new_tokens=20)
        return time.perf_counter() - start_time
    except Exception as e:
        # Best effort on purpose: one failing sample must not abort the
        # whole experiment, so the error is reported and None is returned.
        print(f"[Erro na geracao] {e}")
        return None

# --- 5. MAIN EXECUTION ---
# Pre-processing strategies compared in the experiment.
preprocessors = [BaselinePreprocessor(), RegexPreprocessor(), MWEPreprocessor(mwe_dict=MWE_DICT_PT)]

# One tokenizer instance per static tokenizer name, loaded up front.
static_tokenizers_map = {name: AutoTokenizer.from_pretrained(name) for name in STATIC_TOKENIZERS}
# Flat list of result rows; static and dynamic rows share the same keys.
all_results = []

for lang, config in DATASETS_CONFIG.items():
    print(f"\n--- Processando lingua: {lang.upper()} ---")

    # "config_name" is optional, so .get() yields None when it is absent.
    dataset_name = config["name"]
    config_name = config.get("config_name")
    dataset = load_dataset(dataset_name, name=config_name, split='train')
    dataset = dataset.shuffle(seed=42)

    # Take at most NUM_SAMPLES rows, or fewer when the dataset is smaller.
    num_to_sample = min(NUM_SAMPLES, len(dataset))
    sampled_dataset = dataset.select(range(num_to_sample))

    for sample in tqdm(sampled_dataset, desc=f"Dataset {lang}"):
        text_content = sample[config["text_column"]]

        # Skip empty texts and texts above the word limit.
        if not text_content:
            continue
        if len(text_content.split()) > MAX_TEXT_LENGTH_WORDS:
            continue

        for preprocessor in preprocessors:
            # The MWE strategy is applied only to Portuguese; for any other
            # language it is replaced by a plain lowercase pass.
            if lang == "pt" or not isinstance(preprocessor, MWEPreprocessor):
                processed_text = preprocessor.apply(text_content)
                strategy_name = str(preprocessor)
            else:
                processed_text = text_content.lower()
                strategy_name = "Baseline_Lower"

            # Static analysis: one row per static tokenizer.
            for name, tokenizer in static_tokenizers_map.items():
                num_tokens, fertility = get_static_metrics(processed_text, tokenizer)
                static_row = {
                    "language": lang,
                    "strategy": strategy_name,
                    "model_or_tokenizer": name,
                    "metric_type": "static",
                    "num_tokens": num_tokens,
                    "fertility": fertility,
                    "generation_time": None,
                }
                all_results.append(static_row)

            # Dynamic analysis: one row with the timed generation.
            generation_time = get_dynamic_metrics(processed_text, dynamic_tokenizer, dynamic_model)
            dynamic_row = {
                "language": lang,
                "strategy": strategy_name,
                "model_or_tokenizer": DYNAMIC_MODEL,
                "metric_type": "dynamic",
                "num_tokens": None,
                "fertility": None,
                "generation_time": generation_time,
            }
            all_results.append(dynamic_row)

# --- 6. SAVING THE RESULTS ---
# All collected rows go into one CSV file, without the DataFrame index.
df_results = pd.DataFrame(all_results)
df_results.to_csv("experiment_results.csv", index=False)
print("\nExperimento concluido! Resultados salvos em 'experiment_results.csv'")

# The browser download helper exists only inside Google Colab. The CSV is
# already on disk at this point, so outside Colab the download step is
# skipped instead of failing the run with an ImportError.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('experiment_results.csv')