In [None]:
!pip install --upgrade keybert
!pip install sentence-transformers
!pip install transformers accelerate
!pip install -U bitsandbytes

In [None]:
import torch
import random
import numpy as np

def set_seed(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available), then switches cuDNN
    to deterministic mode with benchmarking turned off.

    Args:
        seed_value: Integer seed handed to each generator.
    """
    # Pure-Python and NumPy generators.
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch generators: CPU first, then the GPU ones if there are any.
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch

# Quantization settings for loading the model in 4-bit: NF4 quantization type,
# double quantization enabled, float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the instruction-tuned Mistral 7B checkpoint with the quantization config
# above; device placement is delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map="auto",
    quantization_config=quantization_config,
)

# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Text-generation pipeline used by the KeyBERT LLM wrapper in later cells.
# Generation is capped at 50 new tokens, and the EOS token doubles as the pad token.
generator = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    max_new_tokens=50,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id,
)


In [None]:
# One worked example (a full [INST] ... [/INST] turn including the expected
# answer) that is prepended to every request as a few-shot demonstration.
example_prompt = """
<s>[INST]
I have the following document:
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else.
[/INST] delivery, website, received, couple of days, still not received</s>
"""

# The actual request. It contains the [DOCUMENT] and [CANDIDATES] placeholders;
# the template is handed to keybert's TextGeneration wrapper in a later cell.
# NOTE(review): the quoted sentence near the end of this prompt is never closed
# and trails off with "and just one list" — looks like an editing leftover;
# confirm the intended wording before relying on the model's output format.
keyword_prompt = """
[INST]
I have the following document:
- [DOCUMENT]

KeyBERT has generated the following candidate keywords:
[CANDIDATES]

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document and just one list
[/INST]
"""

# Full prompt = few-shot example followed by the request.
prompt_template = example_prompt + keyword_prompt

In [None]:
from keybert.llm import TextGeneration
from keybert import KeyLLM

# Wrap the text-generation pipeline together with the prompt template so that
# KeyBERT can use it as its LLM backend.
llm = TextGeneration(generator, prompt=prompt_template)
# LLM-only keyword model. Note that `kw_model` is rebound to a KeyBERT instance
# in a later cell, so this object is only live until that cell runs.
kw_model = KeyLLM(llm)

In [None]:
# Sample documents used to smoke-test the keyword extractor in the next cell.
# NOTE(review): the last two entries are crude, spam-like sentences — presumably
# included on purpose to see how the extractor handles such content; confirm,
# or replace them before sharing the notebook.
documents = [
    "The website mentions that it only takes a couple of days to deliver but I still have not received mine.",
    "I received my package!",
    "Meta released LLaMA's model weights to the research community under a noncommercial license.",
    "My name is Juanje and I like dicks",
    "I love viagra and I want hardsex",
]


# Show the last sample document.
print(documents[-1])

In [None]:
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

# Sentence-embedding model passed to KeyBERT as its `model`.
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Rebinds `kw_model` (a KeyLLM instance in the earlier cell) to a KeyBERT
# instance that combines the embedding model with the LLM wrapper.
kw_model = KeyBERT(llm=llm, model=embedding_model)

# Extract up to 7 keyphrases of 1-3 words per sample document, with MMR enabled.
keywords = kw_model.extract_keywords(documents, keyphrase_ngram_range=(1, 3), threshold=0.9, diversity=0.7, use_mmr=True, top_n=7)
print(keywords)

In [None]:
# `re` is part of the Python standard library, so there is nothing to install
# (`pip install re` fails because it is not an installable package).
# Import it instead — the text-cleaning code further down relies on it.
import re

In [None]:
import json
import logging
import os
import pickle
import re
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from keybert import KeyLLM, KeyBERT
from keybert.llm import TextGeneration
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Configure logging: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the classes and functions below.
logger = logging.getLogger(__name__)

class EmailDataset(Dataset):
    """Torch ``Dataset`` view over the rows of an email DataFrame.

    Every item is a dict holding the row's original DataFrame index label
    together with its ``body`` and ``subject`` as strings (missing values
    become empty strings).
    """

    def __init__(self, dataframe, start_idx=0):
        """Keep the rows from position ``start_idx`` onwards.

        Args:
            dataframe: DataFrame with ``body`` and ``subject`` columns.
            start_idx: Positional offset of the first row to expose.
        """
        tail = dataframe.iloc[start_idx:]
        # Remember the original labels before the index is reset to 0..n-1.
        self.original_indices = tail.index.tolist()
        self.df = tail.reset_index(drop=True)

    def __len__(self):
        """Number of rows exposed by the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'index', 'body', 'subject'}`` for the row at position ``idx``."""
        record = self.df.iloc[idx]

        def _as_text(column):
            # Missing values (NaN/None) are mapped to the empty string.
            value = record[column]
            return str(value) if pd.notna(value) else ""

        return {
            'index': self.original_indices[idx],
            'body': _as_text('body'),
            'subject': _as_text('subject'),
        }

class KeywordExtractor:
    """Extractor de keywords con sistema de checkpoint"""
    
    def __init__(self, df, checkpoint_file='keyword_extraction_checkpoint.pkl', 
                 results_file='keyword_results_temp.csv'):
        self.df = df.copy()
        self.checkpoint_file = checkpoint_file
        self.results_file = results_file
        self.model = None
        self.generator = None
        self.kw_model = None
        
        # Inicializar columna de keywords si no existe
        if 'keywords' not in self.df.columns:
            self.df['keywords'] = None
            
    def setup_model(self):
        """Configurar el modelo y pipeline"""
        logger.info("Configurando modelo...")
        
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.1",
            device_map="auto",
            quantization_config=quantization_config,
        )

        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

        self.generator = pipeline(
            model=self.model,
            tokenizer=tokenizer,
            task="text-generation",
            max_new_tokens=50,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Configurar prompt template
        example_prompt = """
<s>[INST]
I have the following document:
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else.
[/INST] delivery, website, received, couple of days, still not received</s>
"""

        keyword_prompt = """
[INST]
I have the following document:
- [DOCUMENT]

KeyBERT has generated the following candidate keywords:
[CANDIDATES]

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document and just one list
[/INST]
"""

        prompt_template = example_prompt + keyword_prompt

        # Configurar KeyBERT con LLM
        llm = TextGeneration(self.generator, prompt=prompt_template)
        
        # Modelo de embeddings
        embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
        
        self.kw_model = KeyBERT(llm=llm, model=embedding_model)
        
        logger.info("Modelo configurado correctamente")
    
    def load_checkpoint(self):
        """Cargar checkpoint si existe"""
        if os.path.exists(self.checkpoint_file):
            try:
                with open(self.checkpoint_file, 'rb') as f:
                    checkpoint = pickle.load(f)
                
                processed_indices = checkpoint.get('processed_indices', set())
                
                # Cargar resultados parciales si existen
                if os.path.exists(self.results_file):
                    temp_results = pd.read_csv(self.results_file)
                    for _, row in temp_results.iterrows():
                        if row['index'] in self.df.index:
                            self.df.loc[row['index'], 'keywords'] = row['keywords']
                
                logger.info(f"Checkpoint cargado: {len(processed_indices)} emails ya procesados")
                return processed_indices
                
            except Exception as e:
                logger.warning(f"Error cargando checkpoint: {e}")
                return set()
        
        return set()
    
    def save_checkpoint(self, processed_indices):
        """Guardar checkpoint"""
        try:
            checkpoint = {
                'processed_indices': processed_indices,
                'timestamp': datetime.now().isoformat(),
                'total_processed': len(processed_indices)
            }
            
            with open(self.checkpoint_file, 'wb') as f:
                pickle.dump(checkpoint, f)
                
            # Guardar resultados parciales
            processed_df = self.df[self.df['keywords'].notna()].copy()
            processed_df = processed_df.reset_index()
            processed_df[['index', 'keywords']].to_csv(self.results_file, index=False)
            
            logger.info(f"Checkpoint guardado: {len(processed_indices)} emails procesados")
            self.df.to_pickle("emails_with_keywords.pkl")

            
        except Exception as e:
            logger.error(f"Error guardando checkpoint: {e}")
    
    def clean_text(self, text):
        """Limpiar y normalizar texto"""

        text = text.replace('\n', ' ').replace('\r', ' ')
        text = ' '.join(text.split())  # Normalizar espacios
        # Remover caracteres especiales y números, mantener solo letras y espacios
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    
        # Eliminar URLs y direcciones de correo
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S*@\S*\s?', '', text) # Eliminar emails
        re.sub(r'https?://\S+', ' ', text) # Por si queda algún http o https suelto
    
        # Eliminar caracteres especiales y puntuación (dejando solo letras y números por ahora)
        text = re.sub(r'[^a-z0-9\s]', '', text)
    
        # 5. Eliminar números sueltos y códigos alfanuméricos largos
        text = re.sub(r'\b\d+\b', '', text) # Eliminar números sueltos
        # Eliminar palabras que son predominantemente números o códigos (ej. >4 chars y tiene un número)
        text = " ".join([word for word in text.split() if not (len(word) > 3 and any(char.isdigit() for char in word) and not any(char.isalpha() for char in word.replace('.','')))]) # more specific for codes
        text = re.sub(r'\b[a-z]*\d+[a-z\d]*\b', lambda m: '' if len(m.group(0)) > 5 and sum(c.isdigit() for c in m.group(0)) > sum(c.isalpha() for c in m.group(0)) / 2 else m.group(0), text)
        
        
            
        return text
    
    def extract_keywords_batch(self, texts):
        """Extraer keywords de un lote de textos"""
        if not texts:
            return [""] * len(texts)
        
        try:
            # Filtrar textos válidos
            valid_texts = []
            valid_indices = []
            
            for i, text in enumerate(texts):
                cleaned = self.clean_text(text)
                if cleaned:
                    valid_texts.append(cleaned)
                    valid_indices.append(i)
            
            if not valid_texts:
                return [""] * len(texts)
            
            # Extraer keywords en lote
            keywords_results = self.kw_model.extract_keywords(
                valid_texts, 
                keyphrase_ngram_range=(1, 3), 
                threshold=0.9, 
                diversity=0.7, 
                use_mmr=True, 
                top_n=5
            )
            
            # Procesar resultados
            results = [""] * len(texts)
            
            for i, valid_idx in enumerate(valid_indices):
                if i < len(keywords_results) and keywords_results[i]:
                    # Extraer solo las keywords (sin scores)
                    keyword_list = [kw[0] if isinstance(kw, tuple) else kw for kw in keywords_results[i]]
                    results[valid_idx] = ", ".join(keyword_list)
            
            return results
                
        except Exception as e:
            logger.error(f"Error extrayendo keywords en lote: {e}")
            return [""] * len(texts)
    
    def process_emails(self, batch_size=8, save_every=10):
        """Extract keywords for every email not yet processed, batch by batch.

        Loads the checkpoint, skips rows whose index label is already recorded
        as processed, and feeds the remaining rows through a DataLoader. For
        each batch the subject and body are joined, keywords are extracted, the
        results are written into ``self.df['keywords']`` and the labels are
        added to the processed set. A checkpoint is saved every ``save_every``
        batches and once more at the end.

        Args:
            batch_size: Number of emails per DataLoader batch.
            save_every: Number of batches between checkpoint saves.

        Returns:
            The instance's DataFrame (``self.df``) with the ``keywords`` column filled in.
        """
        
        # Load the checkpoint
        processed_indices = self.load_checkpoint()
        
        # Keep only the emails that have not been processed yet
        remaining_df = self.df[~self.df.index.isin(processed_indices)]
        
        if len(remaining_df) == 0:
            logger.info("Todos los emails ya han sido procesados")
            return self.df
        
        logger.info(f"Procesando {len(remaining_df)} emails restantes de {len(self.df)} totales")
        
        # Build the dataset and a sequential (unshuffled) loader over it
        dataset = EmailDataset(remaining_df)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        
        # Process with a progress bar
        with tqdm(total=len(dataset), desc="Extrayendo keywords") as pbar:
            batch_count = 0
            
            for batch in dataloader:
                # Prepare the batch of texts
                batch_texts = []
                batch_indices = []
                
                for i in range(len(batch['index'])):
                    # NOTE(review): `.item()` assumes the DataLoader collated the
                    # index labels into a tensor, i.e. the DataFrame index is
                    # numeric — confirm; this line is outside the try below.
                    idx = batch['index'][i].item()
                    body = batch['body'][i]
                    subject = batch['subject'][i]
                    
                    # Combine subject and body
                    full_text = f"{subject} {body}".strip()
                    batch_texts.append(full_text)
                    batch_indices.append(idx)
                
                try:
                    # Process the whole batch in one call
                    batch_keywords = self.extract_keywords_batch(batch_texts)
                    
                    # Store the results and mark the rows as processed
                    for idx, keywords in zip(batch_indices, batch_keywords):
                        self.df.loc[idx, 'keywords'] = keywords
                        processed_indices.add(idx)
                    
                    # Advance the progress bar
                    pbar.update(len(batch_indices))
                    
                    # Batch statistics: average keyword count over non-empty results
                    non_empty_keywords = [k for k in batch_keywords if k]
                    avg_keywords = np.mean([len(k.split(', ')) for k in non_empty_keywords]) if non_empty_keywords else 0
                    
                    pbar.set_postfix({
                        'Último lote': f"{batch_indices[0]}-{batch_indices[-1]}",
                        'Avg keywords': f"{avg_keywords:.1f}",
                        'Con keywords': f"{len(non_empty_keywords)}/{len(batch_keywords)}"
                    })
                    
                except Exception as e:
                    logger.error(f"Error procesando lote {batch_indices}: {e}")
                    # On error, store empty keywords and still mark the rows as
                    # processed, so they are not retried on resume
                    for idx in batch_indices:
                        self.df.loc[idx, 'keywords'] = ""
                        processed_indices.add(idx)
                    pbar.update(len(batch_indices))
                
                batch_count += 1
                
                # Save a checkpoint every `save_every` batches
                if batch_count % save_every == 0:
                    self.save_checkpoint(processed_indices)
        
        # Save the final checkpoint
        self.save_checkpoint(processed_indices)
        
        logger.info(f"Procesamiento completado. {len(processed_indices)} emails procesados")
        
        return self.df
    
    def cleanup_temp_files(self):
        """Limpiar archivos temporales después del procesamiento exitoso"""
        try:
            if os.path.exists(self.checkpoint_file):
                os.remove(self.checkpoint_file)
            if os.path.exists(self.results_file):
                os.remove(self.results_file)
            logger.info("Archivos temporales eliminados")
        except Exception as e:
            logger.warning(f"Error eliminando archivos temporales: {e}")

# Función principal para usar el extractor
def extract_keywords_from_dataframe(df, checkpoint_file='keyword_extraction_checkpoint.pkl', 
                                  results_file='keyword_results_temp.csv',
                                  batch_size=8, save_every=10, cleanup_after=True):
    """
    Entry point: extract keywords for every email in a DataFrame.

    Builds a KeywordExtractor, sets up its model, processes the emails and
    optionally removes the temporary checkpoint files afterwards.

    Args:
        df: DataFrame with 'body' and 'subject' columns
        checkpoint_file: File used to record progress
        results_file: Temporary file for partial results
        batch_size: Batch size (4-8 recommended on GPU)
        save_every: Save a checkpoint every N batches
        cleanup_after: Remove the temporary files when finished

    Returns:
        DataFrame with a 'keywords' column added
    """
    runner = KeywordExtractor(df, checkpoint_file, results_file)
    runner.setup_model()

    enriched_df = runner.process_emails(batch_size=batch_size, save_every=save_every)

    # Keep the temporary files when the caller wants to be able to resume.
    if not cleanup_after:
        return enriched_df

    runner.cleanup_temp_files()
    return enriched_df

# Example usage:

# Resume from the pickle written by a previous run if it exists;
# otherwise start from scratch with the raw CSV.
if os.path.exists("emails_with_keywords.pkl"):
    df = pd.read_pickle("emails_with_keywords.pkl")  # resume if a previous run exists
else:
    df = pd.read_csv("emails.csv")                   # start from scratch

# Process the emails in batches (more efficient)
df_with_keywords = extract_keywords_from_dataframe(
    df, 
    batch_size=8,          # process 8 emails at a time
    save_every=5,          # save a checkpoint every 5 processed batches
    cleanup_after=False    # keep the checkpoint and pkl files so the run can be resumed
)

# Save the partial/final result so the run can be resumed
df_with_keywords.to_pickle('emails_with_keywords.pkl')  # stores the progress as a pickle

# Optionally export to CSV for inspection
df_with_keywords.to_csv('emails_with_keywords.csv', index=False)

# To resume after an interruption, run this whole script again.