<a href="https://colab.research.google.com/github/IgnasiOliveras/anonimitzar/blob/main/NoumetodeANON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langdetect deep_translator tqdm transformers torch

import sqlite3
import pandas as pd
import re
import time
from langdetect import detect, DetectorFactory, LangDetectException
from deep_translator import GoogleTranslator
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from transformers import pipeline

# Configuración inicial
DetectorFactory.seed = 0

# Cargar modelos
ner_model = pipeline("ner", model="mrm8488/bert-spanish-cased-finetuned-ner", aggregation_strategy="simple")
name_generator = pipeline("text-generation", model="PlanTL-GOB-ES/gpt2-large-bne", device=0)  # Usar GPU si está disponible

def detectar_genero(nombre):
    """Detecta el género usando el modelo de lenguaje"""
    prompt = f"""Clasifica el género del siguiente nombre español. Responde solo con una palabra: masculino, femenino o neutral.
Nombre: {nombre.split()[0]}
Género:"""

    response = name_generator(
        prompt,
        max_length=50,
        num_return_sequences=1,
        truncation=True,
        temperature=0.01  # Bajar temperatura para respuestas más determinísticas
    )[0]['generated_text']

    # Extraer la respuesta
    response = response.replace(prompt, "").strip().lower()
    if 'masculino' in response: return "male"
    if 'femenino' in response: return "female"
    return "neutral"

def generar_nombre(original_name):
    """Genera un nombre con el mismo número de palabras y género usando LM"""
    palabras = original_name.split()
    num_palabras = len(palabras)
    genero = detectar_genero(original_name)

    # Definir plantilla según número de palabras
    plantillas = {
        1: "Genera un nombre de pila {gender} español. Solo el nombre sin apellidos.",
        2: "Genera un nombre completo {gender} español con 1 nombre y 1 apellido.",
        3: "Genera un nombre completo {gender} español con 1 nombre y 2 apellidos."
    }

    prompt = plantillas.get(num_palabras, plantillas[3]).format(
        gender="masculino" if genero == "male" else "femenino"
    )

    for _ in range(3):  # Reintentos en caso de error
        response = name_generator(
            prompt,
            max_length=30,
            num_return_sequences=1,
            truncation=True,
            temperature=0.7
        )[0]['generated_text']

        # Limpiar respuesta
        generated = re.sub(r"[^a-zA-ZáéíóúñÁÉÍÓÚÑ\s]", "", response.split("\n")[0]).strip()

        if len(generated.split()) == num_palabras:
            return generated.title()

    # Fallback para errores
    return "Juan Martínez García" if num_palabras >= 3 else "Ana López"

def detectar_entidades(texto):
    """Detecta entidades PER usando BERT"""
    resultados = ner_model(texto)
    entidades = []

    for ent in resultados:
        if ent['entity_group'] == 'PER':
            start = ent['start']
            end = ent['end']
            original = texto[start:end]

            # Ajustar límites de la entidad
            while start > 0 and texto[start-1] not in (' ', '\n', '\t'):
                start -= 1
            while end < len(texto) and texto[end] not in (' ', '\n', '\t'):
                end += 1

            entidades.append((start, end, original.strip()))

    return entidades

def traducir_y_anonimizar(texto):
    """Proceso completo de anonimización"""
    if not texto.strip():
        return texto

    # Traducción a español
    try:
        if len(texto) > 3 and detect(texto) != "es":
            texto = GoogleTranslator(source="auto", target="es").translate(texto)
    except LangDetectException:
        pass

    # Detección y reemplazo de entidades
    entidades = detectar_entidades(texto)
    replacements = []

    for start, end, original in entidades:
        fake_name = generar_nombre(original)
        replacements.append((start, end, fake_name))

    # Aplicar reemplazos en orden inverso
    for start, end, fake_name in sorted(replacements, key=lambda x: -x[0]):
        texto = texto[:start] + fake_name + texto[end:]

    return texto

def procesar_fila(row):
    """Procesa una fila aplicando traducción y anonimización."""
    row["body"] = traducir_y_anonimizar(row["body"])
    return row

# Resto del código de procesamiento de base de datos (igual que el original)
# ... (el código de conexión a BD y procesamiento paralelo se mantiene igual)

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Device set to use cpu


In [3]:
# Conectar a la base de datos
with sqlite3.connect("mi_base_de_datos.db") as conn:
    cursor = conn.cursor()

    # Crear la tabla si no existe
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS mi_tabla (
            id INTEGER PRIMARY KEY,
            body TEXT,
            secret TEXT,
            direction TEXT,
            createdAt TEXT,
            OpenchannelAccountId INTEGER,
            OpenchannelInteractionId INTEGER,
            UserId INTEGER,
            ContactId INTEGER,
            AttachmentId INTEGER,
            sentBy TEXT
        );
    """)
    conn.commit()

    # Cargar datos desde Excel (solo si es necesario)
    df = pd.read_excel("MOSTRA_1.xlsx")
    df.to_sql("mi_tabla", conn, if_exists="replace", index=False)

    # Extraer solo columnas relevantes
    df_body = df[["id", "body", "direction","createdAt","UserId", "ContactId"]].copy()

    # Usar tqdm para mostrar progreso
    start_time = time.time()
    with Pool(cpu_count()) as pool:
        result = list(tqdm(pool.imap(procesar_fila, df_body.to_dict(orient="records")), total=len(df_body)))

    # Convertir la lista de diccionarios de vuelta a DataFrame
    df_body = pd.DataFrame(result)

    # Actualizar base de datos en un solo paso eficiente
    df_body.to_sql("mi_tabla", conn, if_exists="replace", index=False)

    elapsed_time = time.time() - start_time
    print(f"Procesamiento completado en {elapsed_time:.2f} segundos.")

  0%|          | 0/6943 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
 15%|█▍        | 1023/6943 [06:38<38:27,  2.57it/s]  


KeyboardInterrupt: 

In [4]:
import sqlite3
import pandas as pd
import re
import time
from langdetect import detect, DetectorFactory, LangDetectException
from deep_translator import GoogleTranslator
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from transformers import pipeline

# Configuración inicial
DetectorFactory.seed = 0

# Cargar modelos
ner_model = pipeline("ner", model="mrm8488/bert-spanish-cased-finetuned-ner", aggregation_strategy="simple")
name_generator = pipeline("text-generation", model="PlanTL-GOB-ES/gpt2-large-bne", device=0)  # Usar GPU si está disponible

def detectar_genero(nombre):
    """Detecta el género usando el modelo de lenguaje"""
    prompt = f"""Clasifica el género del siguiente nombre español. Responde solo con una palabra: masculino, femenino o neutral.
Nombre: {nombre.split()[0]}
Género:"""

    response = name_generator(
        prompt,
        max_length=50,
        num_return_sequences=1,
        truncation=True,
        temperature=0.01  # Bajar temperatura para respuestas más determinísticas
    )[0]['generated_text']

    # Extraer la respuesta
    response = response.replace(prompt, "").strip().lower()
    if 'masculino' in response: return "male"
    if 'femenino' in response: return "female"
    return "neutral"

def generar_nombre(original_name):
    """Genera un nombre con el mismo número de palabras y género usando LM"""
    palabras = original_name.split()
    num_palabras = len(palabras)
    genero = detectar_genero(original_name)

    # Definir plantilla según número de palabras
    plantillas = {
        1: "Genera un nombre de pila {gender} español. Solo el nombre sin apellidos.",
        2: "Genera un nombre completo {gender} español con 1 nombre y 1 apellido.",
        3: "Genera un nombre completo {gender} español con 1 nombre y 2 apellidos."
    }

    prompt = plantillas.get(num_palabras, plantillas[3]).format(
        gender="masculino" if genero == "male" else "femenino"
    )

    for _ in range(3):  # Reintentos en caso de error
        response = name_generator(
            prompt,
            max_length=30,
            num_return_sequences=1,
            truncation=True,
            temperature=0.7
        )[0]['generated_text']

        # Limpiar respuesta
        generated = re.sub(r"[^a-zA-ZáéíóúñÁÉÍÓÚÑ\s]", "", response.split("\n")[0]).strip()

        if len(generated.split()) == num_palabras:
            return generated.title()

    # Fallback para errores
    return "Juan Martínez García" if num_palabras >= 3 else "Ana López"

def detectar_entidades(texto):
    """Detecta entidades PER usando BERT"""
    resultados = ner_model(texto)
    entidades = []

    for ent in resultados:
        if ent['entity_group'] == 'PER':
            start = ent['start']
            end = ent['end']
            original = texto[start:end]

            # Ajustar límites de la entidad
            while start > 0 and texto[start-1] not in (' ', '\n', '\t'):
                start -= 1
            while end < len(texto) and texto[end] not in (' ', '\n', '\t'):
                end += 1

            entidades.append((start, end, original.strip()))

    return entidades

def traducir_y_anonimizar(texto):
    """Proceso completo de anonimización"""
    if not texto.strip():
        return texto

    # Traducción a español
    try:
        if len(texto) > 3 and detect(texto) != "es":
            texto = GoogleTranslator(source="auto", target="es").translate(texto)
    except LangDetectException:
        pass

    # Detección y reemplazo de entidades
    entidades = detectar_entidades(texto)
    replacements = []

    for start, end, original in entidades:
        fake_name = generar_nombre(original)
        replacements.append((start, end, fake_name))

    # Aplicar reemplazos en orden inverso
    for start, end, fake_name in sorted(replacements, key=lambda x: -x[0]):
        texto = texto[:start] + fake_name + texto[end:]

    return texto

def procesar_fila(row):
    """Procesa una fila aplicando traducción y anonimización."""
    row["body"] = traducir_y_anonimizar(row["body"])
    return row
    # Conectar a la base de datos
with sqlite3.connect("mi_base_de_datos.db") as conn:
    cursor = conn.cursor()

    # Crear la tabla si no existe
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS mi_tabla (
            id INTEGER PRIMARY KEY,
            body TEXT,
            secret TEXT,
            direction TEXT,
            createdAt TEXT,
            OpenchannelAccountId INTEGER,
            OpenchannelInteractionId INTEGER,
            UserId INTEGER,
            ContactId INTEGER,
            AttachmentId INTEGER,
            sentBy TEXT
        );
    """)
    conn.commit()

    # Cargar datos desde Excel (solo si es necesario)
    df = pd.read_excel("MOSTRA_1.xlsx")
    df.to_sql("mi_tabla", conn, if_exists="replace", index=False)

    # Extraer solo columnas relevantes
    df_body = df[["id", "body", "direction","createdAt","UserId", "ContactId"]].copy()

    # Usar tqdm para mostrar progreso
    start_time = time.time()
    with Pool(cpu_count()) as pool:
        result = list(tqdm(pool.imap(procesar_fila, df_body.to_dict(orient="records")), total=len(df_body)))

    # Convertir la lista de diccionarios de vuelta a DataFrame
    df_body = pd.DataFrame(result)

    # Actualizar base de datos en un solo paso eficiente
    df_body.to_sql("mi_tabla", conn, if_exists="replace", index=False)

    elapsed_time = time.time() - start_time
    print(f"Procesamiento completado en {elapsed_time:.2f} segundos.")

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu
  0%|          | 0/6943 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no pred

KeyboardInterrupt: 