1️⃣ Module Core - core_modules.py

In [1]:
%%writefile core_modules.py
# core_modules.py
import torch
import numpy as np
from typing import Dict, List, Optional
from dataclasses import dataclass
import logging

@dataclass
class PredictionResult:
    """Structure holding the results of a prediction."""
    text: str
    predicted_label: str
    confidence: float
    all_scores: Dict[str, float]  # presumably label name -> score; verify against the producer
    context: Optional[List[str]] = None  # optional, defaults to None
    processing_time: float = 0.0  # defaults to 0.0; unit is not shown here - TODO confirm

class ClimateConfig:
    """Centralized configuration (model, training and LoRA settings)."""

    def __init__(self):
        # Base model and tokenization length.
        self.model_name = "distilbert-base-uncased"
        self.max_length = 256
        # Optimisation hyper-parameters.
        self.batch_size = 16
        self.learning_rate = 2e-4
        self.epochs = 3
        # LoRA adapter settings.
        self.lora_r = 16
        self.lora_alpha = 32
        # Use the GPU when torch reports one, otherwise the CPU.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

    def to_dict(self) -> Dict:
        """Return the settings as a plain dict (device rendered as a string)."""
        plain_fields = ('model_name', 'max_length', 'batch_size', 'learning_rate', 'epochs')
        summary = {field: getattr(self, field) for field in plain_fields}
        summary['device'] = str(self.device)
        summary['lora_config'] = dict(r=self.lora_r, alpha=self.lora_alpha)
        return summary

Overwriting core_modules.py


2️⃣ Module Data Processing - data_modules.py

In [2]:
%%writefile data_modules.py

# data_modules.py
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from typing import Tuple, Optional
import numpy as np

class DataProcessor:
    """Gestion centralis√©e du traitement des donn√©es avec gestion robuste des erreurs"""

    def __init__(self):
        self.text_col = None
        self.label_col = None
        self.label_mapping = {}

    def detect_columns(self, df: pd.DataFrame) -> Tuple[str, str]:
        """D√©tection automatique des colonnes texte et label avec validation"""
        print(f"üîç D√©tection des colonnes sur {df.shape[0]} lignes et {df.shape[1]} colonnes")
        print(f"üìã Colonnes disponibles: {list(df.columns)}")

        text_keywords = ['self_text', 'text', 'content', 'message', 'comment', 'body', 'description']
        label_keywords = ['comment_sentiment', 'sentiment', 'label', 'category', 'class', 'target']

        # Recherche intelligente
        text_col = None
        label_col = None

        # Recherche par mots-cl√©s
        for col in df.columns:
            col_lower = str(col).lower()

            # Recherche colonne texte
            if not text_col:
                for keyword in text_keywords:
                    if keyword.lower() in col_lower:
                        text_col = col
                        break

            # Recherche colonne label
            if not label_col:
                for keyword in label_keywords:
                    if keyword.lower() in col_lower:
                        label_col = col
                        break

        # Fallback intelligent pour la colonne texte
        if not text_col:
            string_cols = []
            for col in df.columns:
                try:
                    # V√©rifier si la colonne contient principalement du texte
                    sample = df[col].dropna().head(100)
                    if len(sample) > 0:
                        # Convertir en string et calculer la longueur moyenne
                        sample_str = sample.astype(str)
                        avg_length = sample_str.str.len().mean()
                        if avg_length > 10:  # Textes probablement plus longs que 10 caract√®res
                            string_cols.append((col, avg_length))
                except:
                    continue

            if string_cols:
                # Prendre la colonne avec le texte le plus long en moyenne
                text_col = max(string_cols, key=lambda x: x[1])[0]
            else:
                # Last resort: premi√®re colonne object
                object_cols = df.select_dtypes(include=['object']).columns
                if len(object_cols) > 0:
                    text_col = object_cols[0]

        # Fallback pour la colonne label
        if not label_col:
            # Chercher une colonne avec peu de valeurs uniques (potentiel label)
            for col in df.columns:
                if col != text_col:
                    try:
                        unique_count = df[col].nunique()
                        total_count = len(df[col].dropna())
                        if total_count > 0 and unique_count < min(20, total_count * 0.1):
                            label_col = col
                            break
                    except:
                        continue

            # Si toujours pas trouv√©, prendre la derni√®re colonne
            if not label_col:
                label_col = df.columns[-1]

        print(f"‚úÖ Colonnes d√©tect√©es: Text='{text_col}', Label='{label_col}'")
        return text_col, label_col

    def clean_text_column(self, series: pd.Series) -> pd.Series:
        """Nettoyage robuste d'une colonne texte"""
        try:
            # Convertir en string d'abord
            cleaned = series.astype(str)

            # Remplacer les valeurs probl√©matiques
            cleaned = cleaned.replace(['nan', 'NaN', 'None', 'null', ''], pd.NA)

            # Supprimer les espaces
            cleaned = cleaned.str.strip()

            # Remplacer les cha√Ænes vides par NaN
            cleaned = cleaned.replace('', pd.NA)

            return cleaned
        except Exception as e:
            print(f"‚ö†Ô∏è Erreur nettoyage texte: {e}")
            # Fallback: conversion simple
            return series.astype(str)

    def prepare_datasets(self, df: pd.DataFrame, sample_size: int = 8000) -> Tuple[Dataset, Dataset, Dataset]:
        """Clean ``df`` and build train/validation/test ``Dataset`` objects.

        Steps: detect the text and label columns, drop missing or placeholder
        values, drop texts of 5 characters or fewer, down-sample to at most
        ``sample_size`` rows, map each label string to an integer id, then
        split 60/20/20 (stratified when every class has at least two rows).

        Side effects: sets ``self.text_col``, ``self.label_col`` and
        ``self.label_mapping``, and prints progress messages.

        Args:
            df: raw input frame.
            sample_size: maximum number of rows kept after cleaning.

        Returns:
            ``(train, val, test)`` datasets with columns ``text`` and ``label_id``.

        Raises:
            ValueError: if the columns cannot be detected, if text and label
                resolve to the same column, or if no row survives cleaning.
            KeyError: if a detected column is missing from ``df``.
        """

        print(f"üìä Pr√©paration des datasets - Taille originale: {df.shape}")

        # Column detection
        self.text_col, self.label_col = self.detect_columns(df)

        # Compare against None explicitly: a column named 0 is valid but falsy.
        # Identical text/label columns would make the model learn from its own target.
        if self.text_col is None or self.label_col is None or self.text_col == self.label_col:
            raise ValueError(f"‚ùå Impossible de d√©tecter les colonnes: text='{self.text_col}', label='{self.label_col}'")

        # Extract a copy of the two columns we need
        try:
            df_work = df[[self.text_col, self.label_col]].copy()
        except KeyError as e:
            print(f"‚ùå Colonnes manquantes: {e}")
            print(f"Colonnes disponibles: {list(df.columns)}")
            raise

        # Rename to fixed names
        df_work.columns = ['text', 'label']

        print(f"üìã Avant nettoyage: {len(df_work)} lignes")

        # 1. Clean the text column
        df_work['text'] = self.clean_text_column(df_work['text'])

        # 2. Clean the label column
        df_work['label'] = df_work['label'].astype(str).str.strip()
        df_work['label'] = df_work['label'].replace(['nan', 'NaN', 'None', 'null', ''], pd.NA)

        # 3. Drop rows with missing values
        initial_size = len(df_work)
        df_work = df_work.dropna()
        print(f"üßπ Apr√®s suppression des NaN: {len(df_work)} lignes (supprim√©: {initial_size - len(df_work)})")

        # 4. Drop texts that are too short (best effort)
        try:
            df_work['text'] = df_work['text'].astype(str)

            mask = df_work['text'].str.len() > 5
            # .copy() so the later column assignment acts on an independent frame.
            df_work = df_work[mask].copy()
            print(f"üìù Apr√®s filtrage textes courts: {len(df_work)} lignes")

        except Exception as e:
            print(f"‚ö†Ô∏è Erreur lors du filtrage des textes: {e}")
            # Continue without filtering on error

        # Final check
        if len(df_work) == 0:
            raise ValueError("‚ùå Aucune donn√©e valide apr√®s nettoyage!")

        # 5. Down-sample when needed
        if len(df_work) > sample_size:
            df_work = df_work.sample(n=sample_size, random_state=42)
            print(f"üéØ √âchantillonnage √† {sample_size} lignes")

        # 6. Label mapping (sorted label strings -> 0..n-1)
        unique_labels = sorted(df_work['label'].unique())
        print(f"üè∑Ô∏è Labels uniques trouv√©s: {unique_labels}")

        self.label_mapping = {str(label): idx for idx, label in enumerate(unique_labels)}
        df_work['label_id'] = df_work['label'].astype(str).map(self.label_mapping)

        # Mapping sanity check
        if df_work['label_id'].isna().any():
            print("‚ö†Ô∏è Probl√®me de mapping des labels d√©tect√©")
            print(f"Labels non mapp√©s: {df_work[df_work['label_id'].isna()]['label'].unique()}")

        print(f"üìä Mapping des labels: {self.label_mapping}")

        # 7. Stratified splits
        try:
            # Stratification needs at least two rows per class.
            if len(unique_labels) > 1 and all(df_work['label_id'].value_counts() >= 2):
                stratify_col = df_work['label_id']
                print("‚úÖ Stratification activ√©e")
            else:
                stratify_col = None
                print("‚ö†Ô∏è Pas de stratification (pas assez d'exemples par classe)")

            # First split: train vs (val + test)
            train_df, temp_df = train_test_split(
                df_work,
                test_size=0.4,
                random_state=42,
                stratify=stratify_col,
            )

            # Second split: val vs test. A class can be left with a single row
            # in the hold-out part; stratify again only when every class still
            # has two rows, instead of failing and discarding the first split.
            temp_stratify = None
            if stratify_col is not None and (temp_df['label_id'].value_counts() >= 2).all():
                temp_stratify = temp_df['label_id']

            val_df, test_df = train_test_split(
                temp_df,
                test_size=0.5,
                random_state=42,
                stratify=temp_stratify,
            )

        except Exception as e:
            print(f"‚ö†Ô∏è Erreur lors du split: {e}")
            # Fallback: plain positional 60/20/20 split
            train_size = int(0.6 * len(df_work))
            val_size = int(0.2 * len(df_work))

            train_df = df_work.iloc[:train_size]
            val_df = df_work.iloc[train_size:train_size + val_size]
            test_df = df_work.iloc[train_size + val_size:]

        print(f"üìä Splits finaux: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

        # 8. Conversion to Dataset
        try:
            train_dataset = Dataset.from_pandas(train_df[['text', 'label_id']].reset_index(drop=True))
            val_dataset = Dataset.from_pandas(val_df[['text', 'label_id']].reset_index(drop=True))
            test_dataset = Dataset.from_pandas(test_df[['text', 'label_id']].reset_index(drop=True))

            print("‚úÖ Datasets cr√©√©s avec succ√®s")

            return train_dataset, val_dataset, test_dataset

        except Exception as e:
            print(f"‚ùå Erreur lors de la cr√©ation des datasets: {e}")
            raise

    def get_stats(self):
        """Statistiques du processeur de donn√©es"""
        return {
            "text_column": self.text_col,
            "label_column": self.label_col,
            "label_mapping": self.label_mapping,
            "num_labels": len(self.label_mapping)
        }

    def validate_dataframe(self, df: pd.DataFrame) -> bool:
        """Validation d'un DataFrame"""
        try:
            if df is None or df.empty:
                print("‚ùå DataFrame vide ou None")
                return False

            if len(df.columns) < 2:
                print("‚ùå DataFrame doit avoir au moins 2 colonnes")
                return False

            print(f"‚úÖ DataFrame valide: {df.shape}")
            return True

        except Exception as e:
            print(f"‚ùå Erreur validation DataFrame: {e}")
            return False

Overwriting data_modules.py


3️⃣ Module Modèle - model_modules.py

In [3]:
%%writefile model_modules.py
# model_modules.py (VERSION ULTRA-STABLE - FP32 ONLY)
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import warnings
import logging

# Silence warnings: only ERROR-level records from the "transformers" logger
# are kept, and every Python warning is ignored process-wide.
# NOTE(review): the blanket filter also hides deprecation warnings from the
# libraries used below - confirm this is intended.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

class ModelManager:
    def __init__(self, config):
        self.config = config
        self.tokenizer = None
        self.peft_model = None
        self.trainer = None

    def setup_tokenizer(self):
        """Tokenizer s√©curis√©."""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token or '[PAD]'
            print(f"‚úÖ Tokenizer charg√© : {self.config.model_name}")
            return self.tokenizer
        except Exception as e:
            print(f"‚ùå Erreur tokenizer : {e}")
            raise

    def setup_model(self, num_labels: int):
        """Mod√®le en FP32 uniquement pour √©viter l'erreur FP16."""
        if num_labels < 2:
            raise ValueError("‚ùå num_labels doit √™tre ‚â• 2")

        try:
            print(f"üîß Chargement mod√®le FP32 pour {num_labels} classes...")

            # üîí Forcer FP32
            base_model = AutoModelForSequenceClassification.from_pretrained(
                self.config.model_name,
                num_labels=num_labels,
                torch_dtype=torch.float32,  # üîí FP32 uniquement
                device_map=None,  # Pas de device_map pour √©viter les conflits
                problem_type="single_label_classification",
            )

            target_modules = self.get_target_modules()

            lora_config = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=self.config.lora_r,
                lora_alpha=self.config.lora_alpha,
                lora_dropout=0.1,
                target_modules=target_modules,
                bias="none",
            )

            self.peft_model = get_peft_model(base_model, lora_config)

            # Affichage des param√®tres
            trainable = sum(p.numel() for p in self.peft_model.parameters() if p.requires_grad)
            total = sum(p.numel() for p in self.peft_model.parameters())
            print(f"üìä Param√®tres : {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

            return self.peft_model

        except Exception as e:
            print(f"‚ùå Erreur mod√®le : {e}")
            raise

    def get_target_modules(self):
        """Modules cibles LoRA."""
        name = self.config.model_name.lower()
        if "distilbert" in name:
            return ["q_lin", "v_lin"]
        elif "bert" in name or "roberta" in name:
            return ["query", "value"]
        return ["query", "value", "dense"]

    def tokenize_function(self, examples):
        """Tokenisation."""
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=self.config.max_length,
        )

    def setup_training_args(self, output_dir="outputs/runs"):
        """TrainingArguments 100% FP32."""
        os.makedirs(output_dir, exist_ok=True)
        logging_dir = os.path.join(output_dir, "logs")
        os.makedirs(logging_dir, exist_ok=True)

        return TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=self.config.epochs,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size * 2,
            learning_rate=self.config.learning_rate,
            warmup_steps=200,
            weight_decay=0.01,

            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="steps",
            logging_steps=50,
            logging_dir=logging_dir,

            load_best_model_at_end=True,
            metric_for_best_model="eval_accuracy",
            greater_is_better=True,

            # üîí D√©sactivation compl√®te de la pr√©cision mixte
            fp16=False,
            bf16=False,
            fp16_backend=None,
            half_precision_backend=None,

            gradient_checkpointing=True,
            dataloader_num_workers=2,

            save_total_limit=2,
            save_steps=500,

            report_to="none",
            remove_unused_columns=False,
            push_to_hub=False,
        )

    def setup_trainer(self, train_dataset, val_dataset):
        """Trainer s√©curis√©."""
        try:
            training_args = self.setup_training_args()
            data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

            def compute_metrics(eval_pred):
                predictions, labels = eval_pred
                preds = np.argmax(predictions, axis=1)
                return {
                    "accuracy": accuracy_score(labels, preds),
                    "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
                    "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
                    "precision": precision_score(labels, preds, average="weighted", zero_division=0),
                    "recall": recall_score(labels, preds, average="weighted", zero_division=0),
                }

            self.trainer = Trainer(
                model=self.peft_model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                tokenizer=self.tokenizer,
                compute_metrics=compute_metrics,
                data_collator=data_collator,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
            )

            print("‚úÖ Trainer configur√©")
            return self.trainer

        except Exception as e:
            print(f"‚ùå Erreur trainer : {e}")
            raise

Overwriting model_modules.py


4. visualization_modules.py

In [4]:
%%writefile visualization_modules.py
# visualization_modules.py
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import streamlit as st
from sklearn.metrics import classification_report, f1_score
import numpy as np

class VisualizationManager:
    """Gestion des visualisations d'entra√Ænement et d'√©valuation"""

    @staticmethod
    def plot_training_curves(log_dir: str):
        """Plot train/eval loss and eval accuracy from ``trainer_state.json``.

        Reads ``<log_dir>/trainer_state.json`` and uses its ``log_history``
        list. Entries containing ``eval_loss`` feed the evaluation curves;
        entries containing ``loss`` feed the training curve. A Streamlit
        warning is shown when the file or the history is missing, and any
        other failure is reported with ``st.error``.

        Args:
            log_dir: directory expected to contain ``trainer_state.json``.
        """
        try:
            import json
            # `os` is not imported at module level in visualization_modules.py;
            # without this import the existence check below raises NameError.
            import os

            log_file = f"{log_dir}/trainer_state.json"
            if not os.path.exists(log_file):
                st.warning("üìÑ Fichier de logs non trouv√©")
                return

            with open(log_file, 'r') as f:
                logs = json.load(f)

            history = logs.get('log_history', [])
            if not history:
                st.warning("üìâ Aucune donn√©e d'entra√Ænement trouv√©e")
                return

            epochs, eval_loss, eval_accuracy = [], [], []
            train_epochs, train_loss = [], []
            for entry in history:
                if 'eval_loss' in entry:
                    epochs.append(entry.get('epoch', 0))
                    eval_loss.append(entry.get('eval_loss', 0))
                    eval_accuracy.append(entry.get('eval_accuracy', 0))
                elif 'loss' in entry:
                    train_loss.append(entry.get('loss', 0))
                    train_epochs.append(entry.get('epoch'))

            # Plot the training loss on the same epoch axis as the evaluation
            # curves; fall back to the step index when an entry has no epoch.
            if any(epoch is None for epoch in train_epochs):
                train_x = list(range(len(train_loss)))
            else:
                train_x = train_epochs

            fig, axes = plt.subplots(1, 2, figsize=(15, 5))
            fig.suptitle('üìà √âvolution de l\'entra√Ænement', fontsize=16)

            # Loss
            if train_loss:
                axes[0].plot(train_x, train_loss, 'b-', label='Train Loss', marker='o')
            if eval_loss:
                axes[0].plot(epochs[:len(eval_loss)], eval_loss, 'r-', label='Eval Loss', marker='s')
            axes[0].set_title('Perte (Loss)')
            axes[0].set_xlabel('Epoch')
            axes[0].set_ylabel('Loss')
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

            # Accuracy
            if eval_accuracy:
                axes[1].plot(epochs[:len(eval_accuracy)], eval_accuracy, 'g-', label='Accuracy', marker='^')
            axes[1].set_title('Pr√©cision')
            axes[1].set_xlabel('Epoch')
            axes[1].set_ylabel('Accuracy')
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

            st.pyplot(fig)

        except Exception as e:
            st.error(f"‚ùå Erreur lors de l'affichage des courbes : {e}")

    @staticmethod
    def show_confusion_matrix(trainer, test_dataset, label_names):
        """Show the confusion matrix and the classification report in Streamlit.

        Args:
            trainer: object whose ``predict(test_dataset)`` returns
                ``predictions`` and ``label_ids``.
            test_dataset: dataset to evaluate.
            label_names: class names used for the axes and the report.
                Assumed to be ordered by label id (0..n-1), as produced by
                the enumerate-based mapping in data_modules - confirm
                against the caller.

        Any failure is reported with ``st.error``.
        """
        try:
            from sklearn.metrics import confusion_matrix, classification_report
            preds_output = trainer.predict(test_dataset)
            predictions = preds_output.predictions
            # Some models return a tuple whose first element holds the logits.
            if isinstance(predictions, tuple):
                predictions = predictions[0]
            preds = predictions.argmax(axis=1)
            labels = preds_output.label_ids

            # Pin the class ids so a class missing from the test set still
            # gets its row/column and the names stay aligned.
            class_ids = list(range(len(label_names)))
            cm = confusion_matrix(labels, preds, labels=class_ids)

            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=label_names, yticklabels=label_names, ax=ax)
            ax.set_title('Matrice de Confusion')
            ax.set_xlabel('Pr√©dictions')
            ax.set_ylabel('Vraies √©tiquettes')
            st.pyplot(fig)

            # Classification report
            st.subheader("üìä Rapport de Classification")
            report = classification_report(
                labels, preds,
                labels=class_ids,
                target_names=label_names,
                output_dict=True,
                zero_division=0,
            )
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df)

        except Exception as e:
            st.error(f"‚ùå Erreur matrice : {e}")

    @staticmethod
    def plot_class_distribution(labels, label_names=None, title="Distribution des classes"):
        """Draw a count plot of ``labels`` and render it in Streamlit.

        When ``label_names`` is given, it replaces the x tick labels.
        """
        figure, axis = plt.subplots(figsize=(8, 4))
        sns.countplot(x=labels, ax=axis)
        if label_names:
            axis.set_xticklabels(label_names)
        axis.set_xlabel("Classes")
        axis.set_ylabel("Nombre d'exemples")
        axis.set_title(title)
        st.pyplot(figure)

    @staticmethod
    def plot_f1_per_class(labels_true, labels_pred, label_names):
        """Draw a bar plot of the per-class F1 score and render it in Streamlit.

        Args:
            labels_true: ground-truth label ids.
            labels_pred: predicted label ids.
            label_names: one name per class. Assumed to be ordered by label
                id (0..n-1) - confirm against the caller.
        """
        # Pin the class ids so there is exactly one score per name even when a
        # class is absent from the data; such classes score 0 without a
        # warning, matching the zero_division=0 used elsewhere in this file.
        class_ids = list(range(len(label_names)))
        scores = f1_score(labels_true, labels_pred, labels=class_ids, average=None, zero_division=0)
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(x=label_names, y=scores, ax=ax)
        ax.set_title("F1-score par classe")
        ax.set_ylabel("F1-score")
        st.pyplot(fig)

Overwriting visualization_modules.py


5. qa_modules.py

In [5]:
%%writefile qa_modules.py
# qa_modules.py
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict
import streamlit as st

class QAModule:
    """Module de recherche et Q&A bas√© sur sentence-transformers"""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Initialisation avec mod√®le d'embedding"""
        try:
            self.encoder = SentenceTransformer(model_name)
            self.corpus_embeddings = None
            self.corpus_texts = []
            self.labels = []
            self.model_name = model_name
        except Exception as e:
            st.error(f"Erreur chargement mod√®le Q&A: {e}")
            # Fallback
            self.encoder = None
            self.model_name = "fallback"

    def fit(self, dataset):
        """Indexe le dataset pour la recherche"""
        if self.encoder is None:
            st.warning("Module Q&A non disponible")
            return

        try:
            self.corpus_texts = [item['text'] for item in dataset]
            self.labels = [item['label_id'] for item in dataset]

            with st.spinner("üìä Indexation des donn√©es pour la recherche..."):
                self.corpus_embeddings = self.encoder.encode(
                    self.corpus_texts,
                    convert_to_tensor=False,
                    show_progress_bar=True
                )
            st.success(f"‚úÖ {len(self.corpus_texts)} √©l√©ments index√©s")

        except Exception as e:
            st.error(f"Erreur indexation Q&A: {e}")

    def query(self, question: str, top_k: int = 5) -> List[Dict]:
        """Recherche les textes les plus similaires √† la question"""
        if self.encoder is None or self.corpus_embeddings is None:
            return []

        try:
            question_embedding = self.encoder.encode([question], convert_to_tensor=False)
            similarities = cosine_similarity(question_embedding, self.corpus_embeddings)[0]

            # Top K indices
            top_indices = np.argsort(similarities)[-top_k:][::-1]

            results = []
            for idx in top_indices:
                results.append({
                    "text": self.corpus_texts[idx],
                    "label_id": int(self.labels[idx]),
                    "score": float(similarities[idx]),
                    "rank": len(results) + 1
                })

            return results

        except Exception as e:
            st.error(f"Erreur recherche Q&A: {e}")
            return []

    def get_stats(self) -> Dict:
        """Statistiques du module Q&A"""
        return {
            "model_name": self.model_name,
            "indexed_items": len(self.corpus_texts),
            "embedding_dim": len(self.corpus_embeddings[0]) if self.corpus_embeddings else 0
        }

Overwriting qa_modules.py


6️⃣ Module Knowledge Base - knowledge_modules.py

In [6]:
%%writefile knowledge_modules.py

# knowledge_modules.py
import numpy as np
from typing import List, Optional
import re

class KnowledgeBase:
    """Gestion de la base de connaissances sans sentence-transformers"""

    def __init__(self):
        self.knowledge_base = []
        self.setup_knowledge_base()

    def setup_knowledge_base(self):
        """Load the built-in knowledge base.

        Replaces ``self.knowledge_base`` with ten hard-coded French
        statements, so anything added earlier with ``add_knowledge`` is
        discarded. Prints a confirmation message.
        """
        # Static corpus searched by find_context().
        self.knowledge_base = [
            "Le r√©chauffement climatique est principalement caus√© par les √©missions de gaz √† effet de serre d'origine humaine.",
            "Les √©nergies renouvelables comme le solaire et l'√©olien sont essentielles pour d√©carboner notre √©conomie.",
            "La d√©forestation massive contribue significativement au changement climatique.",
            "Le secteur des transports repr√©sente environ 24% des √©missions mondiales de gaz √† effet de serre.",
            "L'am√©lioration de l'efficacit√© √©nerg√©tique des b√¢timents peut r√©duire jusqu'√† 50% de leur consommation.",
            "L'agriculture durable et r√©g√©n√©ratrice peut s√©questrer du carbone tout en produisant de la nourriture.",
            "Les oc√©ans absorbent 25% du CO2 atmosph√©rique mais s'acidifient, mena√ßant les √©cosyst√®mes marins.",
            "Les politiques de taxation du carbone incitent les entreprises √† r√©duire leurs √©missions.",
            "L'adaptation au changement climatique est aussi cruciale que l'att√©nuation des √©missions.",
            "Les technologies de capture et stockage du carbone pourraient permettre d'atteindre la neutralit√© carbone."
        ]
        print("‚úÖ Base de connaissances initialis√©e avec recherche par mots-cl√©s")

    def find_context(self, query: str, top_k: int = 3) -> List[str]:
        """Recherche de contexte pertinent par similarit√© textuelle simple"""
        if not query or not self.knowledge_base:
            return []

        try:
            # Nettoyage et tokenisation simple
            query_clean = query.lower()
            query_words = set(re.findall(r'\b\w+\b', query_clean))

            # Score de similarit√© bas√© sur les mots communs
            scored_docs = []

            for doc in self.knowledge_base:
                doc_clean = doc.lower()
                doc_words = set(re.findall(r'\b\w+\b', doc_clean))

                # Calcul du score Jaccard
                intersection = len(query_words & doc_words)
                union = len(query_words | doc_words)

                if union > 0:
                    jaccard_score = intersection / union
                    scored_docs.append((doc, jaccard_score))

            # Tri par score d√©croissant
            scored_docs.sort(key=lambda x: x[1], reverse=True)

            # Retour des top_k documents avec score > 0.1
            relevant_docs = []
            for doc, score in scored_docs[:top_k]:
                if score > 0.1:  # Seuil de pertinence
                    relevant_docs.append(doc)

            return relevant_docs

        except Exception as e:
            print(f"‚ö†Ô∏è Erreur recherche contexte: {e}")
            return []

    def add_knowledge(self, new_knowledge: str):
        """Ajouter une nouvelle connaissance"""
        if new_knowledge and new_knowledge not in self.knowledge_base:
            self.knowledge_base.append(new_knowledge)
            print(f"‚úÖ Nouvelle connaissance ajout√©e: {new_knowledge[:50]}...")

    def get_stats(self):
        """Statistiques de la base de connaissances"""
        return {
            "total_documents": len(self.knowledge_base),
            "avg_length": np.mean([len(doc) for doc in self.knowledge_base]) if self.knowledge_base else 0,
        }

Overwriting knowledge_modules.py


7️⃣ Module Streamlit - streamlit_app.py

In [7]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import torch
import sys
import os

sys.path.append('/content')
from core_modules import ClimateConfig
from data_modules import DataProcessor
from model_modules import ModelManager
from knowledge_modules import KnowledgeBase
from visualization_modules import VisualizationManager
from qa_modules import QAModule

# Page-level Streamlit settings: title, icon and wide layout.
st.set_page_config(page_title="üåç Climate Analyzer ‚Äì Complet", page_icon="üåç", layout="wide")

# Inject the CSS for the "main-header" banner; the class is used by the
# header markup emitted in ClimateAnalyzerApp.run().
st.markdown("""
<style>.main-header{background:linear-gradient(135deg,#667eea,#764ba2);padding:2rem;border-radius:15px;color:white;text-align:center}</style>
""", unsafe_allow_html=True)

class ClimateAnalyzerApp:
    """Streamlit front end for data preparation, training, Q&A and visualisations.

    Streamlit re-executes the script on every widget interaction and the entry
    point of this file builds a new ``ClimateAnalyzerApp`` each time, so
    attributes set on ``self`` do not survive a rerun. Everything that must
    outlive a rerun (trainer, test split, label mapping, fitted Q&A module) is
    therefore kept in ``st.session_state``.
    """

    # Session-state keys managed by the app and their initial values.
    _SESSION_DEFAULTS = {
        "trained": False,
        "trainer": None,
        "test_ds": None,
        "label_mapping": None,
        "qa_module": None,
    }

    # Choices offered by the visualisation selectbox (defined once so the
    # selectbox options and the dispatch below cannot drift apart).
    _VIZ_CONFUSION = "Matrice de confusion"
    _VIZ_REPORT = "Rapport de classification"
    _VIZ_DISTRIBUTION = "Distribution des classes"
    _VIZ_F1 = "F1-score par classe"
    _VIZ_CURVES = "Courbes d'entra√Ænement"

    def __init__(self):
        """Instantiate the configuration and the collaborating modules."""
        self.config = ClimateConfig()
        self.data_processor = DataProcessor()
        self.model_manager = ModelManager(self.config)
        self.knowledge_base = KnowledgeBase()
        self.visualizer = VisualizationManager()
        self.qa_module = QAModule()

    def run(self):
        """Initialise session state, render the banner and dispatch to the selected mode."""
        for key, default in self._SESSION_DEFAULTS.items():
            if key not in st.session_state:
                st.session_state[key] = default

        st.markdown('<div class="main-header"><h1>üåç Climate Sentiment Analyzer</h1><h3>Pipeline Complet</h3></div>', unsafe_allow_html=True)

        # Sidebar label -> handler; insertion order is the display order.
        handlers = {
            "üöÄ Pipeline Complet": self.run_complete_pipeline,
            "üìä Data Processing": self.run_data_processing,
            "‚ùì Q&A": self.run_qa_interface,
            "üìà Visualisations": self.run_visualizations,
        }
        mode = st.sidebar.selectbox("Mode", list(handlers))
        handlers[mode]()

    def run_complete_pipeline(self):
        """Upload a CSV, pick the sample size and epoch count, and start training on demand."""
        st.header("üöÄ Pipeline Complet")
        uploaded_file = st.file_uploader("T√©l√©chargez votre fichier CSV", type=["csv"])

        if uploaded_file:
            df = pd.read_csv(uploaded_file)
            st.success(f"‚úÖ Fichier charg√© : {df.shape[0]} lignes, {df.shape[1]} colonnes")
            st.dataframe(df.head())

            sample_size = st.slider("Taille √©chantillon", 1000, 10000, 4000)
            epochs = st.slider("Epochs", 1, 5, 3)
            # The ModelManager was built with this same config object in __init__.
            self.config.epochs = epochs

            if st.button("üöÄ Lancer l'entra√Ænement", type="primary"):
                self.run_real_training(df, sample_size)

    def _tokenize_split(self, ds):
        """Tokenise one dataset split and format it as torch tensors for the trainer.

        The "text" column is dropped and "label_id" is renamed to "labels".
        ``ds`` itself is left untouched (``map``/``rename_column`` return new
        datasets), so the caller can still read the raw text afterwards.
        """
        tokenized = ds.map(
            self.model_manager.tokenize_function,
            batched=True,
            remove_columns=["text"],
        )
        tokenized = tokenized.rename_column("label_id", "labels")
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        return tokenized

    def run_real_training(self, df, sample_size):
        """Prepare the data, train and evaluate the model, fit the Q&A index and persist the results.

        Args:
            df: DataFrame loaded from the uploaded CSV.
            sample_size: Sample size forwarded to ``DataProcessor.prepare_datasets``.

        Any exception is reported through ``st.error`` instead of propagating.
        """
        progress = st.progress(0)
        status = st.empty()

        try:
            status.text("üìä Pr√©paration des donn√©es...")
            raw_train_ds, raw_val_ds, raw_test_ds = self.data_processor.prepare_datasets(df, sample_size)
            progress.progress(20)

            status.text("ü§ñ Configuration du mod√®le...")
            self.model_manager.setup_tokenizer()
            num_labels = len(self.data_processor.label_mapping)
            self.model_manager.setup_model(num_labels)
            progress.progress(40)

            train_ds = self._tokenize_split(raw_train_ds)
            val_ds = self._tokenize_split(raw_val_ds)
            test_ds = self._tokenize_split(raw_test_ds)
            progress.progress(60)

            trainer = self.model_manager.setup_trainer(train_ds, val_ds)
            progress.progress(70)

            with st.spinner("Entra√Ænement du mod√®le..."):
                trainer.train()
            progress.progress(90)

            metrics = trainer.evaluate(test_ds)

            # Build the Q&A index from the raw training split prepared above
            # rather than running prepare_datasets a second time: this avoids
            # redoing the work and guarantees the index matches the data the
            # model was actually trained on.
            qa_data = [{"text": item["text"], "label_id": item["label_id"]}
                       for item in raw_train_ds]
            self.qa_module.fit(qa_data)

            model_path = "outputs/final_model"
            os.makedirs(model_path, exist_ok=True)
            trainer.save_model(model_path)

            # Persist everything later reruns need, including the fitted Q&A
            # module (a fresh, unfitted one is created on each rerun).
            st.session_state.trained = True
            st.session_state.trainer = trainer
            st.session_state.test_ds = test_ds
            st.session_state.label_mapping = self.data_processor.label_mapping
            st.session_state.qa_module = self.qa_module

            progress.progress(100)
            st.success("üéâ Entra√Ænement termin√© avec succ√®s!")
            # Surface the test-set evaluation instead of discarding it.
            st.json(metrics)

        except Exception as e:
            st.error(f"‚ùå Erreur d'entra√Ænement: {e}")

    def run_data_processing(self):
        """Upload a CSV, split it with the data processor and show the size of each split."""
        st.header("üìä Data Processing")
        uploaded_file = st.file_uploader("T√©l√©chargez votre fichier CSV", type=["csv"])
        if uploaded_file:
            df = pd.read_csv(uploaded_file)
            st.success(f"‚úÖ Fichier charg√©: {df.shape}")
            st.dataframe(df.head())
            train_ds, val_ds, test_ds = self.data_processor.prepare_datasets(df)
            col1, col2, col3 = st.columns(3)
            col1.metric("Train", len(train_ds))
            col2.metric("Validation", len(val_ds))
            col3.metric("Test", len(test_ds))

    def run_qa_interface(self):
        """Answer a free-text question with the Q&A module fitted during training."""
        st.header("‚ùì Interface Q&A")
        if not st.session_state.get("trained", False):
            st.warning("‚ö†Ô∏è Veuillez d'abord entra√Æner un mod√®le.")
            return

        # Prefer the module fitted during training; self.qa_module is rebuilt
        # (unfitted) whenever the script reruns.
        qa_module = st.session_state.get("qa_module")
        if qa_module is None:
            qa_module = self.qa_module

        question = st.text_input("Votre question:", placeholder="Ex: Quelles sont les causes du r√©chauffement climatique?")
        if question:
            results = qa_module.query(question, top_k=5)
            if results:
                for i, result in enumerate(results, 1):
                    with st.expander(f"R√©sultat {i} - Score: {result['score']:.3f}"):
                        st.write(f"**Texte:** {result['text']}")
                        st.write(f"**Label ID:** {result['label_id']}")
            else:
                st.warning("Aucun r√©sultat trouv√©.")

    @staticmethod
    def _ordered_label_names(label_mapping):
        """Return the class names as strings, ordered by class id.

        The mapping is built by the data processor, outside this class, and
        its orientation is not visible here, so both ``{name: id}`` and
        ``{id: name}`` are accepted. "Ids" are recognised as the integers
        ``0..n-1``; if neither side looks like that, the keys are returned in
        insertion order.

        Args:
            label_mapping: Mapping between label names and class ids, or a
                falsy value.

        Returns:
            list[str]: ``names[i]`` is the name of class ``i``; empty for a
            falsy mapping.
        """
        if not label_mapping:
            return []

        def is_id_range(items):
            try:
                return sorted(items) == list(range(len(items)))
            except TypeError:  # mixed, non-comparable entries
                return False

        if is_id_range(list(label_mapping.values())):
            # {name: id}
            ordered = sorted(label_mapping, key=label_mapping.get)
        elif is_id_range(list(label_mapping.keys())):
            # {id: name}
            ordered = [label_mapping[class_id] for class_id in sorted(label_mapping)]
        else:
            ordered = list(label_mapping)
        return [str(name) for name in ordered]

    def run_visualizations(self):
        """Render the selected evaluation view for the trained model.

        Predictions on the test split are computed only for the views that
        need them. Any exception is reported through ``st.error``.
        """
        st.header("üìà Visualisations")
        if not st.session_state.get("trained", False):
            st.warning("‚ö†Ô∏è Aucune donn√©e d'entra√Ænement disponible.")
            return

        viz_option = st.selectbox(
            "Choisir le type de visualisation:",
            [self._VIZ_CONFUSION, self._VIZ_REPORT, self._VIZ_DISTRIBUTION, self._VIZ_F1, self._VIZ_CURVES]
        )

        trainer = st.session_state.trainer
        test_ds = st.session_state.test_ds
        label_names = self._ordered_label_names(st.session_state.label_mapping)

        try:
            if viz_option == self._VIZ_CONFUSION:
                self.visualizer.show_confusion_matrix(trainer, test_ds, label_names)

            elif viz_option == self._VIZ_CURVES:
                self.visualizer.plot_training_curves("outputs/runs/logs")

            else:
                preds_output = trainer.predict(test_ds)
                preds = preds_output.predictions.argmax(axis=1)
                labels = preds_output.label_ids

                if viz_option == self._VIZ_REPORT:
                    from sklearn.metrics import classification_report
                    report = classification_report(labels, preds, target_names=label_names, output_dict=True, zero_division=0)
                    st.dataframe(pd.DataFrame(report).transpose())

                elif viz_option == self._VIZ_DISTRIBUTION:
                    # Translate class ids to names through the same id-ordered
                    # list used everywhere else in this method.
                    self.visualizer.plot_class_distribution([label_names[int(i)] for i in labels])

                elif viz_option == self._VIZ_F1:
                    self.visualizer.plot_f1_per_class(labels, preds, label_names)

        except Exception as e:
            st.error(f"‚ùå Erreur de visualisation : {e}")

class PipelineOrchestrator:
    """Thin entry-point wrapper that owns a ``ClimateAnalyzerApp`` and runs it."""

    def __init__(self):
        """Build the application object and keep it on ``self.app``."""
        application = ClimateAnalyzerApp()
        self.app = application

    def run(self):
        """Delegate to ``ClimateAnalyzerApp.run``."""
        application = self.app
        application.run()


if __name__ == "__main__":
    # Script entry point: build the orchestrator, then render the app.
    orchestrator = PipelineOrchestrator()
    orchestrator.run()

Overwriting streamlit_app.py


6️⃣ Script d'Installation - setup_pipeline.py

In [8]:
%%writefile setup_pipeline.py
# setup_pipeline.py
import subprocess
import sys

def install_dependencies(packages=None):
    """Install the pipeline's dependencies with pip, one requirement at a time.

    Each requirement is installed in its own pip invocation so that one
    failure does not prevent the remaining packages from being installed.
    The final message reflects the real outcome instead of always claiming
    success.

    Args:
        packages: Optional iterable of pip requirement strings. When omitted,
            the default list below is installed.

    Returns:
        list[str]: The requirement strings whose installation failed
        (empty when everything was installed).
    """
    if packages is None:
        packages = [
            "transformers>=4.36.0",
            "datasets>=2.16.0",
            "torch>=2.1.0",
            "peft>=0.7.0",
            "sentence-transformers>=2.2.0",
            "faiss-cpu>=1.7.0",
            "streamlit>=1.29.0",
            "plotly>=5.17.0",
            "scikit-learn>=1.3.0",
            "matplotlib>=3.7.0",
            "seaborn>=0.12.0",
            "pandas>=1.5.0",
            "numpy>=1.24.0"
        ]

    failed = []
    for package in packages:
        try:
            # Use the running interpreter's pip so packages land in its environment.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as e:
            failed.append(package)
            print(f"‚ö†Ô∏è Erreur avec {package}: {e}")
        else:
            print(f"‚úÖ {package} install√©")

    if failed:
        print(f"‚ö†Ô∏è Installation incomplete: {len(failed)} paquet(s) en erreur: {', '.join(failed)}")
    else:
        print("‚úÖ Installation compl√®te termin√©e!")
    return failed


if __name__ == "__main__":
    # Exit with a non-zero status when at least one package failed to install.
    sys.exit(1 if install_dependencies() else 0)

Overwriting setup_pipeline.py


In [9]:
# Run the installer script written to setup_pipeline.py by the %%writefile cell above.
!python setup_pipeline.py

‚úÖ transformers>=4.36.0 install√©
‚úÖ datasets>=2.16.0 install√©
‚úÖ torch>=2.1.0 install√©
‚úÖ peft>=0.7.0 install√©
‚úÖ sentence-transformers>=2.2.0 install√©
‚úÖ faiss-cpu>=1.7.0 install√©
‚úÖ streamlit>=1.29.0 install√©
‚úÖ plotly>=5.17.0 install√©
‚úÖ scikit-learn>=1.3.0 install√©
‚úÖ matplotlib>=3.7.0 install√©
‚úÖ seaborn>=0.12.0 install√©
‚úÖ pandas>=1.5.0 install√©
‚úÖ numpy>=1.24.0 install√©
‚úÖ Installation compl√®te termin√©e!


In [10]:
# Install streamlit via a pip shell escape.
# NOTE(review): setup_pipeline.py already lists "streamlit>=1.29.0", so this cell looks redundant.
!pip install streamlit



In [11]:
# Install pyngrok, which the launch cell below imports; it is not in setup_pipeline.py's package list.
!pip install pyngrok



In [12]:
# Launch the Streamlit app and expose it through an ngrok tunnel.
import getpass
import os
import socket
import subprocess
import time

from pyngrok import ngrok

STREAMLIT_PORT = 8501
STARTUP_TIMEOUT_S = 30  # how long to wait for Streamlit to start listening

# 1) ngrok auth token.
# Never hard-code the token in the notebook: read it from the environment and
# fall back to a hidden interactive prompt.
# NOTE(review): a literal token used to be committed in this cell; it should be
# revoked in the ngrok dashboard and replaced.
TOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("Token ngrok: ")
# Register the token through the pyngrok API rather than interpolating a
# secret into a shell command.
ngrok.set_auth_token(TOKEN)

# 2) Start the main application in the background (its output is discarded).
streamlit_proc = subprocess.Popen(
    ["streamlit", "run", "streamlit_app.py", f"--server.port={STREAMLIT_PORT}", "--server.address=0.0.0.0"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)

# 3) Wait until the server accepts connections (instead of a fixed sleep),
#    failing fast if the process dies or never comes up.
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if streamlit_proc.poll() is not None:
        raise RuntimeError(
            f"streamlit exited early with code {streamlit_proc.returncode}"
        )
    try:
        with socket.create_connection(("127.0.0.1", STREAMLIT_PORT), timeout=1):
            break
    except OSError:
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"streamlit did not start listening on port {STREAMLIT_PORT} "
                f"within {STARTUP_TIMEOUT_S}s"
            )
        time.sleep(0.5)

# 4) Open the tunnel and show the public URL.
public_url = ngrok.connect(STREAMLIT_PORT)
print("üöÄ Interface Streamlit disponible √† :")
print(public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
üöÄ Interface Streamlit disponible √† :
NgrokTunnel: "https://1961fef2d158.ngrok-free.app" -> "http://localhost:8501"
