In [7]:
import json
import numpy as np
import pandas as pd
import spacy
import logging
from datetime import datetime
import os
import pickle
import re
import string
from typing import List, Tuple, Dict, Any
from pathlib import Path

# TensorFlow/Keras imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Dropout, TimeDistributed, BatchNormalization,
    Conv1D, LSTM, MultiHeadAttention, Concatenate, GlobalMaxPooling1D,
    Bidirectional, Add, LayerNormalization
)
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import difflib

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    """Configuration optimisée pour le modèle NER (extraction de mots-clés)"""
    def __init__(self):
        # Data paths
        self.parquet_files = [
            "validation-00000-of-00001 (1).parquet",
            "test-00000-of-00001.parquet",
            "train-00000-of-00001.parquet"
        ]
        self.columns_keep = ['text', 'keywords', 'topic']
        self.model_save_path = Path("models/")
        self.logs_path = Path("logs/")
        self.results_path = Path("results/")

        # NER parameters (optimisés pour mots-clés)
        self.test_size = 0.15
        self.val_size = 0.15
        self.random_state = 42
        self.vocab_size = 15000  # Réduit pour éviter l'overfitting
        self.oov_token = "<OOV>"
        self.pad_token = "<PAD>"
        self.embedding_dim = 128  # Réduit
        self.filters = [64, 128, 64]  # Réduit
        self.kernel_sizes = [2, 3, 4]  # Ajusté
        self.dropout_rate = 0.3  # Réduit
        self.l1_reg = 1e-5
        self.l2_reg = 1e-4
        self.batch_size = 16  # Réduit
        self.epochs_ner = 10  # Augmenté
        self.learning_rate = 1e-3  # Augmenté
        self.patience_early_stopping = 5  # Réduit
        self.patience_lr_reduce = 3
        self.lr_factor = 0.5
        self.min_lr = 1e-6

        # Paramètres d'optimisation
        self.use_bidirectional = True
        self.use_attention_ner = False  # Désactivé pour simplifier
        self.gradient_clip_norm = 1.0

        # Nouveaux paramètres pour améliorer la correspondance
        self.similarity_threshold = 0.8
        self.min_keyword_length = 2
        self.max_sequence_length = 128  # Réduit

        # Create directories
        for path in [self.model_save_path, self.logs_path, self.results_path]:
            path.mkdir(parents=True, exist_ok=True)

class DataProcessor:
    """Gestion des données pour extraction de mots-clés depuis Parquet"""
    def __init__(self, config: Config):
        self.config = config
        self.ner_tokenizer = None
        self.nlp = self._load_spacy_model()
        self.class_weights = None
        self.df_combined = None

    def _load_spacy_model(self):
        """Chargement sécurisé du modèle spaCy"""
        try:
            return spacy.load("en_core_web_sm", disable=["parser", "ner"])
        except OSError:
            try:
                return spacy.load("fr_core_news_sm", disable=["parser", "ner"])
            except OSError:
                logger.warning("Aucun modèle spaCy trouvé, utilisation de tokenisation basique")
                return None

    def load_parquet_data(self) -> pd.DataFrame:
        """Chargement des fichiers Parquet"""
        logger.info("Chargement des fichiers Parquet...")

        # Vérifier l'existence des fichiers
        existing_files = []
        for file in self.config.parquet_files:
            if Path(file).exists():
                existing_files.append(file)
            else:
                logger.warning(f"Fichier non trouvé: {file}")

        if not existing_files:
            raise FileNotFoundError("Aucun fichier Parquet trouvé")

        # Lire et combiner les fichiers
        dfs = []
        for file in existing_files:
            try:
                df = pd.read_parquet(file)[self.config.columns_keep]
                dfs.append(df)
                logger.info(f"Chargé {file}: {len(df)} lignes")
            except Exception as e:
                logger.error(f"Erreur lors du chargement de {file}: {e}")

        self.df_combined = pd.concat(dfs, ignore_index=True)

        # Nettoyer les données NaN
        self.df_combined = self.df_combined.dropna(subset=['text', 'keywords'])

        logger.info(f"Dataset combiné: {len(self.df_combined)} lignes")
        return self.df_combined

    def clean_text(self, text: str) -> str:
        """Nettoyage du texte"""
        if pd.isna(text) or not isinstance(text, str):
            return ""

        # Garder plus de ponctuation pour une meilleure tokenisation
        text = re.sub(r'[^\w\s\.\,\!\?\-\(\)\'\"]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def parse_keywords(self, keywords_str: str) -> List[str]:
        """Parse la chaîne de mots-clés pour extraire une liste"""
        if pd.isna(keywords_str) or not isinstance(keywords_str, str):
            return []

        keywords_str = keywords_str.strip()

        # Si c'est une liste Python en string
        if keywords_str.startswith('[') and keywords_str.endswith(']'):
            try:
                import ast
                return ast.literal_eval(keywords_str)
            except:
                keywords_str = keywords_str[1:-1]

        # Diviser par virgule et nettoyer
        keywords = [kw.strip().strip('"\'') for kw in keywords_str.split(',')]
        keywords = [kw for kw in keywords if kw and len(kw) >= self.config.min_keyword_length]
        return keywords

    def tokenize_sentence(self, sentence: str) -> List[str]:
        """Tokenisation d'une phrase"""
        if not sentence.strip():
            return []

        if self.nlp is not None:
            try:
                doc = self.nlp(sentence)
                return [token.text.lower() for token in doc if not token.is_space and not token.is_punct]
            except Exception as e:
                logger.warning(f"Erreur lors de la tokenisation spaCy: {e}")

        # Tokenisation basique
        tokens = re.findall(r'\b\w+\b', sentence.lower())
        return [token for token in tokens if len(token) >= 2]

    def find_keyword_matches(self, tokens: List[str], keywords: List[str]) -> List[int]:
        """Trouve les correspondances entre tokens et mots-clés avec fuzzy matching"""
        labels = [0] * len(tokens)

        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            keyword_tokens = self.tokenize_sentence(keyword_lower)

            if not keyword_tokens:
                continue

            # Correspondance exacte
            for i, token in enumerate(tokens):
                if token in keyword_tokens or any(kt in token for kt in keyword_tokens):
                    labels[i] = 1

            # Correspondance floue pour les mots-clés composés
            if len(keyword_tokens) > 1:
                keyword_text = ' '.join(keyword_tokens)
                token_text = ' '.join(tokens)

                # Chercher des sous-séquences
                for i in range(len(tokens) - len(keyword_tokens) + 1):
                    window = ' '.join(tokens[i:i+len(keyword_tokens)])
                    similarity = difflib.SequenceMatcher(None, window, keyword_text).ratio()

                    if similarity >= self.config.similarity_threshold:
                        for j in range(i, i + len(keyword_tokens)):
                            if j < len(labels):
                                labels[j] = 1

        return labels

    def create_ner_dataset(self, df: pd.DataFrame) -> Tuple[List, List]:
        """Créer un dataset NER à partir du DataFrame"""
        sentences = []
        labels = []

        logger.info("Création du dataset NER...")
        valid_count = 0
        skipped_count = 0

        # Limiter le dataset pour les tests
        df_sample = df.head(5000) if len(df) > 5000 else df

        for idx, row in df_sample.iterrows():
            if idx % 500 == 0:
                logger.info(f"Traitement ligne {idx}/{len(df_sample)}")

            text = self.clean_text(row['text'])
            keywords_list = self.parse_keywords(row['keywords'])

            if not text or not keywords_list:
                skipped_count += 1
                continue

            # Tokeniser le texte
            tokens = self.tokenize_sentence(text)
            if len(tokens) < 3 or len(tokens) > self.config.max_sequence_length:
                skipped_count += 1
                continue

            # Créer les labels avec fuzzy matching
            token_labels = self.find_keyword_matches(tokens, keywords_list)

            # Vérifier qu'il y a au moins un mot-clé identifié
            if sum(token_labels) == 0:
                skipped_count += 1
                continue

            sentences.append(tokens)
            labels.append(token_labels)
            valid_count += 1

        logger.info(f"Dataset créé: {valid_count} phrases valides, {skipped_count} ignorées")

        # Statistiques
        total_keywords = sum(sum(label_seq) for label_seq in labels)
        total_tokens = sum(len(label_seq) for label_seq in labels)
        keyword_ratio = total_keywords / total_tokens if total_tokens > 0 else 0

        logger.info(f"Statistiques: {total_keywords} mots-clés sur {total_tokens} mots ({keyword_ratio:.2%})")

        return sentences, labels

    def prepare_ner_data(self, sentences: List, labels: List) -> Tuple:
        """Préparation des données avec division simple"""
        logger.info("Préparation des données...")

        # Filtrer les séquences valides
        valid_data = []
        for sent, lab in zip(sentences, labels):
            if len(sent) == len(lab) and len(sent) > 0:
                keyword_ratio = sum(lab) / len(lab)
                # Garder les phrases avec un ratio raisonnable de mots-clés
                if 0.01 <= keyword_ratio <= 0.5:
                    valid_data.append((sent, lab))

        if len(valid_data) < 100:
            raise ValueError("Pas assez de données valides pour l'entraînement")

        sentences, labels = zip(*valid_data)
        sentences, labels = list(sentences), list(labels)

        # Division simple
        train_size = int(0.7 * len(sentences))
        val_size = int(0.15 * len(sentences))

        sentences_train = sentences[:train_size]
        labels_train = labels[:train_size]

        sentences_val = sentences[train_size:train_size + val_size]
        labels_val = labels[train_size:train_size + val_size]

        sentences_test = sentences[train_size + val_size:]
        labels_test = labels[train_size + val_size:]

        logger.info(f"Division: Train={len(sentences_train)}, Val={len(sentences_val)}, Test={len(sentences_test)}")
        return sentences_train, sentences_val, sentences_test, labels_train, labels_val, labels_test

    def create_ner_tokenizer(self, sentences_train: List, sentences_val: List):
        """Création du tokenizer NER"""
        all_words = []
        for sentence in sentences_train + sentences_val:
            all_words.extend(sentence)

        word_freq = Counter(all_words)
        # Garder les mots qui apparaissent au moins 2 fois
        filtered_words = [word for word, freq in word_freq.items() if freq >= 2]

        self.ner_tokenizer = Tokenizer(
            num_words=self.config.vocab_size,
            oov_token=self.config.oov_token,
            filters='',
            lower=True
        )

        filtered_sentences = [" ".join(sentence) for sentence in sentences_train + sentences_val]
        self.ner_tokenizer.fit_on_texts(filtered_sentences)

        logger.info(f"Tokenizer créé: vocab_size={len(self.ner_tokenizer.word_index)}")
        logger.info(f"Mots les plus fréquents: {list(self.ner_tokenizer.word_index.items())[:10]}")

    def tokenize_and_pad_ner(self, sentences: List, labels: List, maxlen: int) -> Tuple[np.ndarray, np.ndarray]:
        """Tokenisation et padding pour NER"""
        sequences = []
        valid_labels = []

        for sentence, label_seq in zip(sentences, labels):
            # Tokeniser
            seq = self.ner_tokenizer.texts_to_sequences([" ".join(sentence)])[0]

            if len(seq) > 0:  # Garder seulement les séquences non vides
                sequences.append(seq)
                valid_labels.append(label_seq)

        X = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

        y = []
        for label_seq in valid_labels:
            # Tronquer ou padding des labels
            if len(label_seq) > maxlen:
                padded_labels = label_seq[:maxlen]
            else:
                padded_labels = label_seq + [0] * (maxlen - len(label_seq))
            y.append(padded_labels)

        y = np.array(y).reshape(-1, maxlen, 1)

        logger.info(f"Données tokenisées: X.shape={X.shape}, y.shape={y.shape}")
        return X, y


    def compute_class_weights(self, labels_train: List) -> Dict:
        """Calcul des poids de classe"""
        flat_labels = [label for label_seq in labels_train for label in label_seq]
        unique_labels = np.unique(flat_labels)

        # Ensure both 0 and 1 are present
        all_classes = np.array([0, 1])
        present_labels = np.union1d(unique_labels, all_classes)


        class_weights = compute_class_weight('balanced', classes=present_labels, y=flat_labels)

        # Limiter les poids extrêmes
        class_weights = np.clip(class_weights, 0.5, 5.0)

        # Ensure weights are a dictionary with keys 0 and 1
        self.class_weights = dict(zip(present_labels, class_weights))
        logger.info(f"Poids de classe calculés: {self.class_weights}")
        return self.class_weights

    def evaluate_ner_model(self, model, X_test, y_test, tokenizer) -> Dict:
        """Évaluation complète du modèle NER"""
        predictions = model.predict(X_test, verbose=0)
        y_pred = (predictions > 0.5).astype(int)

        y_true_flat = []
        y_pred_flat = []

        for i in range(len(y_test)):
            # Prendre seulement les tokens non-paddés
            seq_len = np.sum(X_test[i] != 0)
            y_true_flat.extend(y_test[i][:seq_len, 0])
            y_pred_flat.extend(y_pred[i][:seq_len, 0])

        # Éviter les erreurs avec des arrays vides
        if len(y_true_flat) == 0 or len(set(y_true_flat)) < 2:
            logger.warning("Pas assez de données pour l'évaluation")
            return {
                'precision': 0.0,
                'recall': 0.0,
                'f1_score': 0.0,
                'support': len(y_true_flat)
            }

        precision = precision_score(y_true_flat, y_pred_flat, average='binary', zero_division=0)
        recall = recall_score(y_true_flat, y_pred_flat, average='binary', zero_division=0)
        f1 = f1_score(y_true_flat, y_pred_flat, average='binary', zero_division=0)

        metrics = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'support': len(y_true_flat)
        }

        logger.info(f"Métriques finales - Précision: {precision:.3f}, Rappel: {recall:.3f}, F1: {f1:.3f}")
        return metrics

    def extract_keywords_from_text(self, model, text: str, tokenizer, maxlen: int) -> List[str]:
        """Extraction des mots-clés d'un texte donné"""
        if not text.strip():
            return []

        # Nettoyer et tokeniser le texte
        clean_text = self.clean_text(text)
        tokens = self.tokenize_sentence(clean_text)
        if not tokens:
            return []

        # Conversion en séquence
        sequence = tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        if not sequence:
            return []

        # Padding
        X = pad_sequences([sequence], maxlen=maxlen, padding='post')

        # Prédiction
        predictions = model.predict(X, verbose=0)
        y_pred = (predictions > 0.3).astype(int)  # Seuil plus bas

        # Extraction des mots-clés
        keywords = []
        for i, (token, pred) in enumerate(zip(tokens, y_pred[0])):
            if i < len(tokens) and i < len(y_pred[0]) and pred[0] == 1:
                keywords.append(token)

        return keywords

# Custom Binary Crossentropy Loss with Class Weights
def weighted_binary_crossentropy(class_weights):
    def loss(y_true, y_pred):
        y_true_flat = tf.cast(tf.reshape(y_true, [-1]), tf.float32)
        y_pred_flat = tf.reshape(y_pred, [-1])

        # Calculer les poids
        weight_0 = class_weights.get(0, 1.0)
        weight_1 = class_weights.get(1, 1.0)

        # Ensure weights tensor is float32
        weights = tf.where(tf.equal(y_true_flat, 1), tf.constant(weight_1, dtype=tf.float32), tf.constant(weight_0, dtype=tf.float32))

        # Binary crossentropy
        bce = tf.keras.losses.binary_crossentropy(y_true_flat, y_pred_flat)
        weighted_bce = bce * weights

        return tf.reduce_mean(weighted_bce)

    return loss

# Custom F1 Score Metric
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_state(self):
        self.precision.reset_state()
        self.recall.reset_state()

class KeywordExtractionModel:
    """Modèle simplifié pour extraction de mots-clés"""
    def __init__(self, config: Config, vocab_size: int):
        self.config = config
        self.vocab_size = vocab_size
        self.model = None

    def build_model(self, maxlen: int, class_weights: Dict) -> Model:
        """Construction du modèle simplifié"""
        inputs = Input(shape=(maxlen,), name='input_tokens')

        # Embedding
        embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.config.embedding_dim,
            mask_zero=True,
            name='embedding'
        )(inputs)
        x = Dropout(self.config.dropout_rate)(embedding)

        # Couches convolutionnelles simplifiées
        conv_outputs = []
        for i, (filters, kernel_size) in enumerate(zip(self.config.filters, self.config.kernel_sizes)):
            conv = Conv1D(
                filters, kernel_size,
                activation='relu',
                padding='same',
                name=f'conv_{i+1}'
            )(x)
            conv = Dropout(self.config.dropout_rate)(conv)
            conv_outputs.append(conv)

        # Concaténation
        if len(conv_outputs) > 1:
            x = Concatenate(name='concat_conv')(conv_outputs)
        else:
            x = conv_outputs[0]

        # LSTM bidirectionnel
        if self.config.use_bidirectional:
            x = Bidirectional(
                LSTM(64, return_sequences=True, dropout=self.config.dropout_rate),
                name='bilstm'
            )(x)

        # Couche dense finale
        x = TimeDistributed(
            Dense(32, activation='relu'),
            name='dense_1'
        )(x)
        x = Dropout(self.config.dropout_rate)(x)

        # Sortie
        outputs = TimeDistributed(Dense(1, activation='sigmoid'), name='keyword_output')(x)

        model = Model(inputs=inputs, outputs=outputs, name='Keyword_Extraction_Model')

        # Optimiseur
        optimizer = AdamW(
            learning_rate=self.config.learning_rate,
            clipnorm=self.config.gradient_clip_norm
        )

        # Loss function
        custom_loss = weighted_binary_crossentropy(class_weights)

        model.compile(
            optimizer=optimizer,
            loss=custom_loss,
            metrics=['accuracy', 'precision', 'recall', F1Score()]
        )

        logger.info(f"Modèle construit - Paramètres: {model.count_params():,}")
        return model

    def create_callbacks(self, timestamp: str) -> List:
        """Callbacks pour l'entraînement"""
        return [
            EarlyStopping(
                monitor='val_f1_score',
                patience=self.config.patience_early_stopping,
                restore_best_weights=True,
                mode='max',
                verbose=1
            ),
            ModelCheckpoint(
                filepath=self.config.model_save_path / f"best_keyword_model_{timestamp}.keras",
                monitor='val_f1_score',
                save_best_only=True,
                mode='max',
                verbose=1
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=self.config.lr_factor,
                patience=self.config.patience_lr_reduce,
                min_lr=self.config.min_lr,
                verbose=1
            )
        ]

class ModelTrainer:
    """Gestionnaire d'entraînement du modèle d'extraction de mots-clés"""
    def __init__(self, config: Config):
        self.config = config
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def train_keyword_model(self, data_processor: DataProcessor) -> Tuple[Model, Dict]:
        """Entraînement complet du modèle d'extraction de mots-clés"""
        logger.info("=== DÉBUT ENTRAÎNEMENT MODÈLE EXTRACTION MOTS-CLÉS ===")

        # Chargement des données Parquet
        df = data_processor.load_parquet_data()

        # Création du dataset NER
        sentences, labels = data_processor.create_ner_dataset(df)

        if len(sentences) == 0:
            raise ValueError("Aucune phrase valide trouvée dans les données")

        # Préparation des données
        sentences_train, sentences_val, sentences_test, labels_train, labels_val, labels_test = \
            data_processor.prepare_ner_data(sentences, labels)

        # Création du tokenizer
        data_processor.create_ner_tokenizer(sentences_train, sentences_val)

        # Calcul de la longueur maximale
        all_sentences = sentences_train + sentences_val
        max_len_found = max(len(s) for s in all_sentences) if all_sentences else 50
        maxlen = min(max_len_found, self.config.max_sequence_length)

        logger.info(f"Longueur max des séquences: {maxlen}")

        # Tokenisation et padding
        X_train, y_train = data_processor.tokenize_and_pad_ner(sentences_train, labels_train, maxlen)
        X_val, y_val = data_processor.tokenize_and_pad_ner(sentences_val, labels_val, maxlen)
        X_test, y_test = data_processor.tokenize_and_pad_ner(sentences_test, labels_test, maxlen)

        # Vérification des données
        if X_train.shape[0] == 0:
            raise ValueError("Aucune donnée d'entraînement valide")

        # Calcul des poids de classe
        class_weights = data_processor.compute_class_weights(labels_train)

        # Construction du modèle
        model_builder = KeywordExtractionModel(self.config, len(data_processor.ner_tokenizer.word_index) + 1)
        model = model_builder.build_model(maxlen, class_weights)

        # Callbacks
        callbacks = model_builder.create_callbacks(self.timestamp)

        # Entraînement
        logger.info("Début de l'entraînement...")
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=self.config.epochs_ner,
            batch_size=self.config.batch_size,
            callbacks=callbacks,
            verbose=1
        )

        # Évaluation
        metrics = data_processor.evaluate_ner_model(model, X_test, y_test, data_processor.ner_tokenizer)

        # Test d'extraction de mots-clés
        test_texts = [
            "This is an example text to test automatic keyword extraction from important content.",
            "Machine learning models can extract important keywords from text documents.",
            "Natural language processing helps computers understand human language."
        ]

        for test_text in test_texts:
            extracted_keywords = data_processor.extract_keywords_from_text(
                model, test_text, data_processor.ner_tokenizer, maxlen
            )
            logger.info(f"Test - Texte: '{test_text}'")
            logger.info(f"Mots-clés extraits: {extracted_keywords}")

        # Sauvegarde
        model_path = self.config.model_save_path / f"keyword_extraction_model_{self.timestamp}.keras"
        model.save(model_path)

        tokenizer_path = self.config.model_save_path / f"keyword_tokenizer_{self.timestamp}.pkl"
        with open(tokenizer_path, 'wb') as f:
            pickle.dump(data_processor.ner_tokenizer, f)

        # Sauvegarde des résultats
        results = {
            'model_path': str(model_path),
            'tokenizer_path': str(tokenizer_path),
            'metrics': metrics,
            'history': {k: [float(v) for v in vals] for k, vals in history.history.items()},
            'config': {k: str(v) if isinstance(v, Path) else v for k, v in self.config.__dict__.items()},
            'maxlen': maxlen,
            'vocab_size': len(data_processor.ner_tokenizer.word_index),
            'test_extractions': [
                {
                    'text': text,
                    'keywords': data_processor.extract_keywords_from_text(
                        model, text, data_processor.ner_tokenizer, maxlen
                    )
                }
                for text in test_texts
            ]
        }

        results_path = self.config.results_path / f"keyword_extraction_results_{self.timestamp}.json"
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        logger.info(f"=== ENTRAÎNEMENT TERMINÉ - F1 Score: {metrics['f1_score']:.3f} ===")
        return model, results

def main():
    """Fonction principale d'exécution"""
    try:
        logger.info("=== DÉBUT DU PIPELINE D'EXTRACTION DE MOTS-CLÉS ===")

        # Configuration
        config = Config()
        logger.info(f"Configuration chargée")

        # Initialisation
        data_processor = DataProcessor(config)
        trainer = ModelTrainer(config)

        # Entraînement du modèle d'extraction de mots-clés
        logger.info("Lancement de l'entraînement du modèle d'extraction de mots-clés...")
        keyword_model, keyword_results = trainer.train_keyword_model(data_processor)

        # Résumé final
        logger.info("=== ENTRAÎNEMENT TERMINÉ AVEC SUCCÈS ===")
        logger.info(f"Modèle d'extraction de mots-clés - F1 Score: {keyword_results['metrics']['f1_score']:.3f}")
        logger.info(f"Modèle sauvegardé dans: {config.model_save_path}")
        logger.info(f"Résultats sauvegardés dans: {config.results_path}")

        return {
            'keyword_model': keyword_model,
            'keyword_results': keyword_results,
            'data_processor': data_processor
        }

    except Exception as e:
        logger.error(f"Erreur lors de l'exécution: {str(e)}")
        raise

# Fonction utilitaire pour charger un modèle sauvegardé
def load_trained_model(model_path: str, tokenizer_path: str, config: Config = None):
    """Charge un modèle et tokenizer sauvegardés"""
    if config is None:
        config = Config()

    # Charger le modèle
    model = load_model(model_path, custom_objects={
        'F1Score': F1Score,
        'weighted_binary_crossentropy': weighted_binary_crossentropy
    })

    # Charger le tokenizer
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)

    # Créer le data processor
    data_processor = DataProcessor(config)
    data_processor.ner_tokenizer = tokenizer

    return model, data_processor

# Fonction utilitaire pour extraire des mots-clés d'un nouveau texte
def extract_keywords(text: str, model_path: str, tokenizer_path: str, maxlen: int = 128):
    """Extrait les mots-clés d'un texte avec un modèle pré-entraîné"""
    config = Config()
    model, data_processor = load_trained_model(model_path, tokenizer_path, config)

    keywords = data_processor.extract_keywords_from_text(
        model, text, data_processor.ner_tokenizer, maxlen
    )

    return keywords

if __name__ == "__main__":
    # Configuration de l'environnement
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    tf.get_logger().setLevel('ERROR')

    # Gestion de la mémoire GPU
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            logger.warning(f"Erreur configuration GPU: {e}")

    # Exécution
    results = main()

Epoch 1/10




[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.9301 - f1_score: 0.1591 - loss: 0.1305 - precision: 0.3046 - recall: 0.1153




Epoch 1: val_f1_score improved from -inf to 0.69404, saving model to models/best_keyword_model_20250626_201852.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 371ms/step - accuracy: 0.9302 - f1_score: 0.1604 - loss: 0.1302 - precision: 0.3063 - recall: 0.1163 - val_accuracy: 0.9680 - val_f1_score: 0.6940 - val_loss: 0.0532 - val_precision: 0.7586 - val_recall: 0.6396 - learning_rate: 0.0010
Epoch 2/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.9766 - f1_score: 0.7895 - loss: 0.0441 - precision: 0.8046 - recall: 0.7751
Epoch 2: val_f1_score improved from 0.69404 to 0.72211, saving model to models/best_keyword_model_20250626_201852.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 338ms/step - accuracy: 0.9766 - f1_score: 0.7895 - loss: 0.0441 - precision: 0.8046 - recall: 0.7751 - val_accuracy: 0.9700 - val_f1_score: 0.7221 - val_loss: 0.0511 - val_precision: 0.7610 - val_recall: 0.6870 - le

