# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import email
import imaplib
from typing import List, Dict, Tuple
import joblib

class EmailPreprocessor:
    """Text cleaning and raw-message parsing for the email classifier.

    Holds the Spanish stop-word set used by ``clean_text`` and the TF-IDF
    vectorizer that callers fit and apply to the cleaned text.
    """

    def __init__(self):
        # Resources required by stopwords.words() and word_tokenize().
        nltk.download('stopwords')
        nltk.download('punkt')
        self.stop_words = set(stopwords.words('spanish'))
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def clean_text(self, text: str) -> str:
        """Clean and normalize email text.

        Lowercases the text, replaces every character that is neither a
        word character nor whitespace with a space, removes digit runs,
        tokenizes, and drops tokens found in ``self.stop_words``.

        Args:
            text: Raw text (typically subject plus body).

        Returns:
            The remaining tokens joined by single spaces.
        """
        # Lowercase
        text = text.lower()
        # Replace special characters with spaces
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        # Drop stop words
        tokens = [t for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    @staticmethod
    def _decode_part(part) -> str:
        """Return the text content of a single (non-multipart) message part."""
        payload = part.get_payload()
        encoding = str(part.get('content-transfer-encoding', '')).strip().lower()
        if (isinstance(payload, str) and not payload.isascii()
                and encoding not in ('base64', 'quoted-printable')):
            try:
                # Lone surrogates cannot be encoded; they indicate raw bytes
                # carried through a surrogateescape decode, which need the
                # byte-level decoding below.
                payload.encode('utf-8')
            except UnicodeEncodeError:
                pass
            else:
                # Already real Unicode text with no transfer encoding to
                # undo; a round trip through bytes could only corrupt it.
                return payload

        raw = part.get_payload(decode=True)
        if raw is None:
            return ''
        # Honor the declared charset instead of assuming UTF-8, and never
        # let a single bad byte abort processing of the whole message.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset, errors='replace')
        except (LookupError, ValueError):
            # Unknown or malformed charset name.
            return raw.decode('utf-8', errors='replace')

    @staticmethod
    def _decode_header_value(value) -> str:
        """Return a header as text, expanding RFC 2047 encoded-words."""
        if not value:
            return ''
        from email.errors import HeaderParseError
        from email.header import decode_header, make_header

        text = str(value)
        try:
            return str(make_header(decode_header(text)))
        except (HeaderParseError, LookupError, UnicodeError):
            # Keep the undecoded header rather than failing the message.
            return text

    def extract_email_features(self, email_data: str) -> Dict:
        """Extract the fields of a raw email that the classifier uses.

        Args:
            email_data: Complete message source (headers and body) as text.

        Returns:
            Dict with keys ``'subject'``, ``'body'``, ``'from'`` and
            ``'date'``. Absent headers are returned as ``''``. ``'body'`` is
            the first ``text/plain`` part of a multipart message (``''`` if
            there is none), or the sole payload of a non-multipart message.
        """
        msg = email.message_from_string(email_data)

        # Extract the body text
        body = ""
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    body = self._decode_part(part)
                    break
        else:
            body = self._decode_part(msg)

        return {
            'subject': self._decode_header_value(msg['subject']),
            'body': body,
            'from': msg['from'] or '',
            'date': msg['date'] or ''
        }

class EmailClassifier:
    """Email category classifier: TF-IDF features fed to a random forest."""

    def __init__(self):
        self.preprocessor = EmailPreprocessor()
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)

    def prepare_data(self, emails_data: List[Dict]) -> Tuple[object, pd.Series]:
        """Build the training matrix and labels from labelled emails.

        Fits ``self.preprocessor.vectorizer`` on the cleaned text as a side
        effect.

        Args:
            emails_data: Records providing ``'subject'``, ``'body'`` and
                ``'category'``. A missing/None subject or body is treated
                as an empty string.

        Returns:
            ``(X, y)`` where ``X`` is whatever the vectorizer's
            ``fit_transform`` returns for the cleaned texts and ``y`` is the
            ``'category'`` column.

        Raises:
            KeyError: If a required column is absent from every record.
        """
        df = pd.DataFrame(emails_data)

        # Missing values would otherwise propagate as NaN through the string
        # concatenation and make clean_text fail on a float.
        subject = df['subject'].fillna('').astype(str)
        body = df['body'].fillna('').astype(str)

        # Combine subject and body for analysis
        df['text'] = subject + ' ' + body

        # Clean text
        df['cleaned_text'] = df['text'].apply(self.preprocessor.clean_text)

        # Vectorize
        X = self.preprocessor.vectorizer.fit_transform(df['cleaned_text'])
        y = df['category']

        return X, y

    def train(self, X, y):
        """Fit the model on an 80/20 split and print a classification report.

        Args:
            X: Feature matrix as returned by ``prepare_data``.
            y: Labels aligned with ``X``.

        Returns:
            ``(X_test, y_test)``, the held-out portion of the split.
        """
        # NOTE(review): when X comes from prepare_data, the vectorizer was
        # fitted on all rows before this split, so the printed report is
        # likely optimistic.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        self.model.fit(X_train, y_train)

        # Evaluate on the held-out rows
        y_pred = self.model.predict(X_test)
        print(classification_report(y_test, y_pred))

        return X_test, y_test

    def predict(self, email_data: str) -> str:
        """Classify a new raw email.

        Args:
            email_data: Complete message source as text.

        Returns:
            The predicted category label for the message.
        """
        features = self.preprocessor.extract_email_features(email_data)
        # Formatting (rather than ``+``) also copes with a subject that the
        # email package returns as a non-str header object.
        text = f"{features['subject']} {features['body']}"
        cleaned_text = self.preprocessor.clean_text(text)
        vectorized = self.preprocessor.vectorizer.transform([cleaned_text])
        return self.model.predict(vectorized)[0]

    def save_model(self, path: str):
        """Persist the trained model and its vectorizer to ``path``."""
        joblib.dump({
            'model': self.model,
            'vectorizer': self.preprocessor.vectorizer
        }, path)

    @classmethod
    def load_model(cls, path: str):
        """Load a classifier previously written by ``save_model``.

        SECURITY: joblib files are pickle-based; loading one can execute
        arbitrary code. Only load files from a trusted source.

        Args:
            path: File written by ``save_model``.

        Returns:
            A new instance whose model and vectorizer come from the file.
        """
        saved_model = joblib.load(path)
        classifier = cls()
        classifier.model = saved_model['model']
        classifier.preprocessor.vectorizer = saved_model['vectorizer']
        return classifier

class EmailManager:
    """Connects to an IMAP mailbox and files messages by predicted category."""

    def __init__(self, email_server: str, username: str, password: str):
        self.server = email_server
        self.username = username
        self.password = password
        self.classifier = EmailClassifier()

    def connect(self):
        """Open an SSL IMAP connection and log in; stores it as ``self.mail``."""
        self.mail = imaplib.IMAP4_SSL(self.server)
        self.mail.login(self.username, self.password)

    def process_inbox(self):
        """Classify every message in the inbox and move it to its folder.

        Requires ``connect()`` to have been called. Messages that cannot be
        fetched are skipped. Deleted originals are expunged once, after the
        whole inbox has been walked.
        """
        self.mail.select('inbox')
        status, messages = self.mail.search(None, 'ALL')
        if status != 'OK' or not messages or not messages[0]:
            return

        moved_any = False
        for msg_id in messages[0].split():
            status, msg_data = self.mail.fetch(msg_id, '(RFC822)')
            if status != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
                continue
            # Valid UTF-8 decodes exactly as before; bytes that are not valid
            # UTF-8 are preserved as surrogates instead of raising.
            email_body = msg_data[0][1].decode('utf-8', errors='surrogateescape')

            # Classify the message
            category = self.classifier.predict(email_body)

            # Move to the matching folder. Expunging is deferred: it
            # renumbers the mailbox and would invalidate the remaining
            # sequence numbers from the search above.
            if self.move_to_folder(msg_id, category, expunge=False):
                moved_any = True

        if moved_any:
            self.mail.expunge()

    def move_to_folder(self, msg_id: bytes, category: str, *, expunge: bool = True) -> bool:
        """Move a message to the folder mapped to its category.

        The message is copied to the target folder and, only if the copy
        succeeded, flagged ``\\Deleted`` in the current mailbox.

        Args:
            msg_id: Message identifier in the selected mailbox.
            category: Predicted category; unmapped categories are ignored.
            expunge: Expunge the mailbox right after flagging. Pass False
                when moving several messages by sequence number and expunge
                once afterwards.

        Returns:
            True if the message was copied and flagged, False otherwise.
        """
        folder_mapping = {
            'cementos': 'Compras_Cementos',
            'energia': 'Compras_Energia',
            'concretos': 'Compras_Concretos',
            'general': 'Correos_Generales'
        }

        folder = folder_mapping.get(category)
        if not folder:
            return False

        status, _ = self.mail.copy(msg_id, folder)
        if status != 'OK':
            # Copy failed (e.g. target folder missing): keep the original.
            return False

        self.mail.store(msg_id, '+FLAGS', '\\Deleted')
        if expunge:
            self.mail.expunge()
        return True

# Usage example
def main():
    """Train on historical mail, save the model, then sort the inbox."""
    # Load historical data.
    # NOTE: load_historical_emails is not defined in this file; it has to be
    # implemented and return records accepted by EmailClassifier.prepare_data.
    historical_data = load_historical_emails()

    # Create and train the classifier
    classifier = EmailClassifier()
    X, y = classifier.prepare_data(historical_data)
    classifier.train(X, y)

    # Save the trained model
    classifier.save_model('email_classifier_model.pkl')

    # Configure the mail manager.
    # NOTE: placeholder credentials; do not hard-code real ones.
    email_manager = EmailManager(
        email_server='imap.empresaGA.com',
        username='usuario@empresaGA.com',
        password='contraseña'
    )
    # EmailManager constructs its own, untrained EmailClassifier; replace it
    # with the one fitted above so predictions use the trained model.
    email_manager.classifier = classifier

    # Process new mail
    email_manager.connect()
    email_manager.process_inbox()


if __name__ == "__main__":
    main()
