In [8]:
pip install streamlit torch transformers datasets peft scikit-learn plotly



In [9]:
%%writefile climate_app.py
import os
import csv
import time
import logging
import tempfile
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np
import pandas as pd
import streamlit as st
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px

# Logging configuration: INFO level, each record formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    """Single source of truth for the model, sampling and training settings."""

    # Base checkpoint, plus device and precision picked from CUDA availability.
    MODEL_NAME = "t5-small"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

    # Defaults for sampling, tokenisation and training.
    DEFAULT_PARAMS = dict(
        sample_sizes=dict(
            test_rapide=30_000,   # 10K per class - quick test, ~5 min
            validation=75_000,    # 25K per class - validation run, ~15 min
            production=150_000,   # 50K per class - final model, ~30 min
            maximum=300_000,      # 100K per class - only if needed
        ),
        chunk_size=20_000,
        test_size=0.2,
        val_size=0.125,
        max_input_length=256,
        max_target_length=16,
        train_batch_size=8,
        eval_batch_size=8,
        num_epochs=3,
        learning_rate=5e-4,
        lora_r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        eval_steps=50,
        logging_steps=25,
    )

    # Sentiment name -> class id, and the inverse lookup (LABEL_NAMES[i] names class i).
    LABEL_MAPPING = dict(negative=0, neutral=1, positive=2)
    LABEL_NAMES = ["negative", "neutral", "positive"]

class SmartSampler:
    """√âchantillonneur intelligent pour gros datasets"""

    def __init__(self, config: Config):
        """Keep a reference to the application configuration."""
        self.config = config

    def estimate_dataset_size(self, file_path: str) -> Dict[str, Any]:
        """Estime la taille et les caract√©ristiques du dataset"""
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Estimation du nombre de lignes bas√©e sur la taille du fichier
        # R√®gle empirique : ~1KB par ligne pour du texte Reddit
        estimated_lines = int(file_size_mb * 1000)

        return {
            "file_size_mb": file_size_mb,
            "estimated_lines": estimated_lines,
            "processing_time_estimate": self._estimate_processing_time(estimated_lines)
        }

    def _estimate_processing_time(self, lines: int) -> Dict[str, str]:
        """Estime les temps de traitement selon la taille"""
        times = {}
        for size_name, sample_size in self.config.DEFAULT_PARAMS["sample_sizes"].items():
            if sample_size >= lines:
                times[size_name] = f"{int(lines / 5000)} min (dataset complet)"
            else:
                times[size_name] = f"{int(sample_size / 5000)} min"
        return times

    def create_stratified_sample(self, df: pd.DataFrame, target_size: int,
                                progress_callback=None) -> pd.DataFrame:
        """Cr√©e un √©chantillon stratifi√© intelligent"""

        if progress_callback:
            progress_callback("üîç Analyse de la distribution des classes...")

        # Analyse de la distribution
        class_counts = df['label'].value_counts().sort_index()
        total_samples = len(df)

        st.info(f"üìä Distribution originale: {dict(class_counts)}")

        # Calcul des tailles par classe pour l'√©quilibrage
        samples_per_class = target_size // 3  # 3 classes

        if progress_callback:
            progress_callback(f"üéØ Objectif: {samples_per_class:,} √©chantillons par classe")

        balanced_samples = []

        for class_label in [0, 1, 2]:  # negative, neutral, positive
            class_data = df[df['label'] == class_label]
            available_samples = len(class_data)

            if available_samples == 0:
                st.warning(f"‚ö†Ô∏è Aucun √©chantillon trouv√© pour la classe {self.config.LABEL_NAMES[class_label]}")
                continue

            # Prendre le minimum entre ce qui est disponible et ce qui est demand√©
            n_samples = min(samples_per_class, available_samples)

            if progress_callback:
                progress_callback(f"üìù √âchantillonnage classe {self.config.LABEL_NAMES[class_label]}: {n_samples:,} √©chantillons")

            # √âchantillonnage al√©atoire stratifi√©
            if n_samples < available_samples:
                sampled_class = class_data.sample(n=n_samples, random_state=42)
            else:
                sampled_class = class_data

            balanced_samples.append(sampled_class)

        # Combinaison et m√©lange final
        if progress_callback:
            progress_callback("üîÑ Combinaison des √©chantillons...")

        final_sample = pd.concat(balanced_samples, ignore_index=True)
        final_sample = final_sample.sample(frac=1, random_state=42).reset_index(drop=True)

        # Statistiques finales
        final_class_counts = final_sample['label'].value_counts().sort_index()
        st.success(f"‚úÖ √âchantillon cr√©√©: {dict(final_class_counts)} (Total: {len(final_sample):,})")

        return final_sample

class OptimizedDataProcessor:
    """Processeur de donn√©es optimis√© pour l'√©chantillonnage"""

    def __init__(self, config: Config):
        """Store the configuration and build the sampler that shares it."""
        self.config = config
        self.sampler = SmartSampler(config)

    def load_and_sample_data(self, file_path: str, target_sample_size: int) -> Optional[pd.DataFrame]:
        """Charge et √©chantillonne les donn√©es de mani√®re optimis√©e"""

        try:
            # Estimation initiale
            file_info = self.sampler.estimate_dataset_size(file_path)
            st.info(f"üìÅ Fichier: {file_info['file_size_mb']:.1f} MB (~{file_info['estimated_lines']:,} lignes estim√©es)")

            # Interface de progression
            progress_bar = st.progress(0)
            status_text = st.empty()

            def update_progress(message):
                status_text.text(message)

            # Strat√©gie de chargement bas√©e sur la taille
            if file_info["file_size_mb"] > 500:  # > 500MB
                return self._load_large_file_with_sampling(
                    file_path, target_sample_size, progress_bar, update_progress
                )
            else:
                return self._load_and_sample_standard(
                    file_path, target_sample_size, progress_bar, update_progress
                )

        except Exception as e:
            st.error(f"‚ùå Erreur lors du traitement: {str(e)}")
            logger.error(f"Erreur traitement donn√©es: {e}")
            return None

    def _load_and_sample_standard(self, file_path: str, target_size: int,
                                 progress_bar, update_progress) -> Optional[pd.DataFrame]:
        """Charge un fichier standard et l'√©chantillonne"""

        update_progress("üìñ Lecture du fichier...")
        progress_bar.progress(0.2)

        # Tentative de chargement avec diff√©rents encodages
        df = None
        for encoding in ['utf-8', 'latin-1', 'cp1252']:
            try:
                df = pd.read_csv(
                    file_path,
                    encoding=encoding,
                    on_bad_lines='skip',
                    engine='python',
                    quoting=csv.QUOTE_MINIMAL
                )
                logger.info(f"‚úÖ Fichier charg√© avec encoding {encoding}")
                break
            except UnicodeDecodeError:
                continue

        if df is None:
            st.error("‚ùå Impossible de d√©coder le fichier CSV")
            return None

        progress_bar.progress(0.4)
        update_progress("üßπ Validation et nettoyage...")

        # Validation et nettoyage
        cleaned_df = self._validate_and_clean_data(df)
        if cleaned_df.empty:
            return None

        progress_bar.progress(0.6)

        # √âchantillonnage intelligent
        sampled_df = self.sampler.create_stratified_sample(
            cleaned_df, target_size, update_progress
        )

        progress_bar.progress(1.0)
        update_progress(f"‚úÖ Traitement termin√©: {len(sampled_df):,} √©chantillons")

        return sampled_df

    def _load_large_file_with_sampling(self, file_path: str, target_size: int,
                                      progress_bar, update_progress) -> Optional[pd.DataFrame]:
        """Charge un gros fichier avec √©chantillonnage par chunks"""

        update_progress("üîç Analyse du gros fichier...")

        # Premi√®re passe : estimation et √©chantillonnage des chunks
        chunk_size = self.config.DEFAULT_PARAMS["chunk_size"]
        sampled_chunks = []
        total_processed = 0

        # Calcul du ratio d'√©chantillonnage approximatif
        file_info = self.sampler.estimate_dataset_size(file_path)
        if file_info["estimated_lines"] > target_size:
            chunk_sample_ratio = target_size / file_info["estimated_lines"] * 2  # x2 pour avoir de la marge
        else:
            chunk_sample_ratio = 1.0

        try:
            # Lecture par chunks avec √©chantillonnage
            for encoding in ['utf-8', 'latin-1', 'cp1252']:
                try:
                    chunk_reader = pd.read_csv(
                        file_path,
                        encoding=encoding,
                        chunksize=chunk_size,
                        on_bad_lines='skip',
                        engine='python',
                        quoting=csv.QUOTE_MINIMAL
                    )

                    for i, chunk in enumerate(chunk_reader):
                        # Validation du premier chunk
                        if i == 0:
                            if not self._validate_columns(chunk):
                                return None

                        # Nettoyage du chunk
                        cleaned_chunk = self._clean_chunk(chunk)
                        if len(cleaned_chunk) == 0:
                            continue

                        # √âchantillonnage du chunk si n√©cessaire
                        if chunk_sample_ratio < 1.0:
                            n_samples = max(1, int(len(cleaned_chunk) * chunk_sample_ratio))
                            cleaned_chunk = cleaned_chunk.sample(n=n_samples, random_state=42)

                        sampled_chunks.append(cleaned_chunk)
                        total_processed += len(chunk)

                        # Mise √† jour de la progression
                        progress = min(0.4 + (i * 0.4 / 100), 0.8)  # 40-80% pour le chargement
                        progress_bar.progress(progress)
                        update_progress(f"üìä Chunks trait√©s: {i+1} - Lignes: {total_processed:,}")

                        # Arr√™t si on a assez de donn√©es
                        total_samples = sum(len(chunk) for chunk in sampled_chunks)
                        if total_samples >= target_size * 3:  # x3 pour avoir de la marge avant l'√©quilibrage
                            break

                    break  # Succ√®s avec cet encoding

                except UnicodeDecodeError:
                    continue

            if not sampled_chunks:
                st.error("‚ùå Aucune donn√©e valide trouv√©e")
                return None

            # Combinaison des chunks
            progress_bar.progress(0.85)
            update_progress("üîÑ Assemblage des donn√©es...")

            combined_df = pd.concat(sampled_chunks, ignore_index=True)

            # √âchantillonnage final stratifi√©
            progress_bar.progress(0.9)
            final_sample = self.sampler.create_stratified_sample(
                combined_df, target_size, update_progress
            )

            progress_bar.progress(1.0)
            update_progress(f"‚úÖ Gros fichier trait√©: {len(final_sample):,} √©chantillons finaux")

            return final_sample

        except Exception as e:
            st.error(f"‚ùå Erreur lors du traitement du gros fichier: {str(e)}")
            return None

    def _validate_columns(self, df: pd.DataFrame) -> bool:
        """Valide la pr√©sence des colonnes requises"""
        required_cols = ["comment_sentiment", "post_title", "self_text"]
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            st.error(f"‚ùå Colonnes manquantes: {missing_cols}")
            return False
        return True

    def _validate_and_clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Valide et nettoie un DataFrame complet"""
        if not self._validate_columns(df):
            return pd.DataFrame()
        return self._clean_data(df)

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Nettoie les donn√©es"""
        initial_count = len(df)

        # Filtrage des labels valides
        valid_labels = set(self.config.LABEL_MAPPING.keys())
        df = df[df["comment_sentiment"].isin(valid_labels)]

        # Suppression des valeurs manquantes
        df = df.dropna(subset=["comment_sentiment", "post_title", "self_text"])

        # Cr√©ation des labels num√©riques
        df["label"] = df["comment_sentiment"].map(self.config.LABEL_MAPPING).astype(int)

        # Cr√©ation du texte combin√©
        df["text"] = (
            df["post_title"].fillna("") + " " + df["self_text"].fillna("")
        ).str.strip()

        # Filtrage des textes vides
        df = df[df["text"].str.len() > 0]

        final_count = len(df)
        if initial_count > 0:
            logger.info(f"Donn√©es nettoy√©es: {initial_count} ‚Üí {final_count} √©chantillons ({final_count/initial_count*100:.1f}% conserv√©s)")

        return df[["text", "label", "comment_sentiment"]]

    def _clean_chunk(self, chunk: pd.DataFrame) -> pd.DataFrame:
        """Clean one chunk of a chunked read by delegating to ``_clean_data``."""
        return self._clean_data(chunk)

    def split_data(self, df: pd.DataFrame, test_size: float, val_size: float) -> Dict[str, pd.DataFrame]:
        """Divise les donn√©es en train/val/test avec stratification"""
        try:
            train_val, test = train_test_split(
                df, test_size=test_size, random_state=42, stratify=df["label"]
            )

            train, val = train_test_split(
                train_val, test_size=val_size, random_state=42, stratify=train_val["label"]
            )

            return {"train": train, "validation": val, "test": test}

        except Exception as e:
            st.error(f"‚ùå Erreur lors de la division des donn√©es: {str(e)}")
            return {}

class ModelManager:
    """Gestionnaire du mod√®le et de l'entra√Ænement optimis√©"""

    def __init__(self, config: Config):
        self.config = config
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_logs = {"train_loss": [], "val_loss": [], "steps": [], "epoch": []}

    def initialize_model(self, lora_params: Dict[str, Any]) -> bool:
        """Load the tokenizer and base model, then wrap the model with LoRA adapters.

        Progress and results are reported through Streamlit spinners and messages.

        Args:
            lora_params: Dict providing ``lora_r``, ``lora_alpha`` and ``lora_dropout``.

        Returns:
            ``True`` when ``self.tokenizer`` and ``self.model`` were set up;
            ``False`` when any step raised (the error is shown in the UI and logged).
        """
        try:
            with st.spinner("ü§ñ Chargement du tokenizer..."):
                self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
                # Fall back to the end-of-sequence token when no padding token is defined.
                if self.tokenizer.pad_token is None:
                    self.tokenizer.pad_token = self.tokenizer.eos_token

            with st.spinner("üß† Chargement du mod√®le de base..."):
                # NOTE(review): TORCH_DTYPE is float16 on CUDA and setup_trainer also
                # enables fp16 mixed precision there — confirm this combination trains
                # correctly with the installed transformers/peft versions.
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    self.config.MODEL_NAME,
                    torch_dtype=self.config.TORCH_DTYPE,
                    device_map=None
                ).to(self.config.DEVICE)

            with st.spinner("üîß Configuration LoRA..."):
                # Adapters are attached to the modules named "q" and "v".
                lora_config = LoraConfig(
                    task_type=TaskType.SEQ_2_SEQ_LM,
                    r=lora_params["lora_r"],
                    lora_alpha=lora_params["lora_alpha"],
                    target_modules=["q", "v"],
                    lora_dropout=lora_params["lora_dropout"],
                    bias="none"
                )

                self.model = get_peft_model(base_model, lora_config)

            # Report how many parameters are trainable versus the total.
            trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in self.model.parameters())

            st.success(f"‚úÖ Mod√®le initialis√©!")
            st.info(f"üìä Param√®tres entra√Ænables: {trainable_params:,} / {total_params:,} ({trainable_params/total_params*100:.2f}%)")

            logger.info("‚úÖ Mod√®le initialis√© avec succ√®s")
            return True

        except Exception as e:
            st.error(f"‚ùå Erreur d'initialisation du mod√®le: {str(e)}")
            logger.error(f"Erreur initialisation mod√®le: {e}")
            return False

    def preprocess_data(self, examples: Dict[str, List], max_input_length: int, max_target_length: int):
        """Pr√©processe les donn√©es pour l'entra√Ænement"""
        inputs = [f"classify sentiment: {text}" for text in examples["text"]]
        targets = [self.config.LABEL_NAMES[label] for label in examples["label"]]

        model_inputs = self.tokenizer(
            inputs,
            max_length=max_input_length,
            truncation=True,
            padding=True,
            return_tensors="pt" if len(inputs) == 1 else None
        )

        labels = self.tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding=True,
            return_tensors="pt" if len(targets) == 1 else None
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def compute_metrics(self, eval_pred) -> Dict[str, float]:
        """Calcule les m√©triques d'√©valuation de mani√®re robuste"""
        try:
            predictions = eval_pred.predictions[0]
            labels = eval_pred.label_ids

            # D√©codage des pr√©dictions
            decoded_preds = []
            for pred in predictions:
                if isinstance(pred, np.ndarray):
                    pred_ids = np.argmax(pred, axis=-1) if pred.ndim > 1 else pred
                else:
                    pred_ids = pred

                decoded_text = self.tokenizer.decode(pred_ids, skip_special_tokens=True).strip().lower()

                # Mapping robuste des pr√©dictions
                if "negative" in decoded_text:
                    decoded_preds.append(0)
                elif "positive" in decoded_text:
                    decoded_preds.append(2)
                else:
                    decoded_preds.append(1)  # neutral par d√©faut

            # D√©codage des labels
            decoded_labels = []
            for label in labels:
                if hasattr(label, '__iter__') and not isinstance(label, str):
                    label_text = self.tokenizer.decode(label, skip_special_tokens=True).strip().lower()
                    if "negative" in label_text:
                        decoded_labels.append(0)
                    elif "positive" in label_text:
                        decoded_labels.append(2)
                    else:
                        decoded_labels.append(1)
                else:
                    decoded_labels.append(int(label))

            return {
                "accuracy": accuracy_score(decoded_labels, decoded_preds),
                "f1_weighted": f1_score(decoded_labels, decoded_preds, average="weighted"),
                "f1_macro": f1_score(decoded_labels, decoded_preds, average="macro")
            }

        except Exception as e:
            logger.error(f"Erreur calcul m√©triques: {e}")
            return {"accuracy": 0.0, "f1_weighted": 0.0, "f1_macro": 0.0}

    def setup_trainer(self, datasets: Dict[str, Dataset], training_params: Dict[str, Any]) -> bool:
        """Configure le trainer optimis√©"""
        try:
            # Callback pour tracker les losses
            class LossTrackingCallback(TrainerCallback):
                def __init__(self, logs_dict):
                    self.logs = logs_dict

                def on_log(self, args, state, control, logs=None, **kwargs):
                    if logs:
                        if "loss" in logs:
                            self.logs["train_loss"].append(logs["loss"])
                            self.logs["steps"].append(state.global_step)
                            self.logs["epoch"].append(state.epoch)
                        if "eval_loss" in logs:
                            self.logs["val_loss"].append(logs["eval_loss"])

            # Configuration optimis√©e de l'entra√Ænement
            training_args = TrainingArguments(
                output_dir="./lora_climate_model",
                per_device_train_batch_size=training_params["train_batch_size"],
                per_device_eval_batch_size=training_params["eval_batch_size"],
                num_train_epochs=training_params["num_epochs"],
                learning_rate=training_params["learning_rate"],
                warmup_steps=100,  # Warm-up pour stabiliser l'entra√Ænement
                eval_strategy="steps",
                eval_steps=training_params["eval_steps"],
                logging_steps=training_params["logging_steps"],
                save_strategy="steps",
                save_steps=training_params["eval_steps"],
                load_best_model_at_end=True,
                metric_for_best_model="eval_f1_weighted",
                greater_is_better=True,
                report_to=None,
                dataloader_pin_memory=False,
                remove_unused_columns=True,
                push_to_hub=False,
                fp16=torch.cuda.is_available(),  # Optimisation m√©moire si GPU
            )

            self.trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=datasets["train"],
                eval_dataset=datasets["validation"],
                tokenizer=self.tokenizer,
                compute_metrics=self.compute_metrics,
                callbacks=[LossTrackingCallback(self.training_logs)]
            )

            return True

        except Exception as e:
            st.error(f"‚ùå Erreur configuration trainer: {str(e)}")
            logger.error(f"Erreur setup trainer: {e}")
            return False

def create_streamlit_app():
    """Interface Streamlit optimis√©e pour l'Option A"""

    st.set_page_config(
        page_title="üåç Climate Sentiment AI - Option A",
        page_icon="üåç",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.title("üåç Climate Sentiment AI - Option A : √âchantillonnage Intelligent")
    st.markdown("*Optimis√© pour traiter efficacement des datasets de 1,2 Go avec √©chantillonnage stratifi√©*")
    st.markdown("---")

    # Initialisation des objets
    config = Config()
    data_processor = OptimizedDataProcessor(config)
    model_manager = ModelManager(config)

    # Sidebar optimis√©e pour l'√©chantillonnage
    st.sidebar.header("‚öôÔ∏è Configuration Option A")

    # S√©lection de la strat√©gie d'√©chantillonnage
    st.sidebar.subheader("üéØ Strat√©gie d'√©chantillonnage")

    sample_strategies = {
        "üöÄ Test rapide (30K)": {
            "size": config.DEFAULT_PARAMS["sample_sizes"]["test_rapide"],
            "description": "Validation rapide en 5 min",
            "use_case": "Test de faisabilit√©"
        },
        "‚úÖ Validation (75K)": {
            "size": config.DEFAULT_PARAMS["sample_sizes"]["validation"],
            "description": "√âquilibre temps/qualit√© en 15 min",
            "use_case": "D√©veloppement et test"
        },
        "üéØ Production (150K)": {
            "size": config.DEFAULT_PARAMS["sample_sizes"]["production"],
            "description": "Mod√®le final en 30 min",
            "use_case": "Mod√®le de production"
        },
        "üî• Maximum (300K)": {
            "size": config.DEFAULT_PARAMS["sample_sizes"]["maximum"],
            "description": "Performance maximale en 60 min",
            "use_case": "Si qualit√© insuffisante"
        }
    }

    selected_strategy = st.sidebar.selectbox(
        "Choisir la strat√©gie",
        options=list(sample_strategies.keys()),
        index=2,  # Production par d√©faut
        help="Choisissez selon vos contraintes de temps et qualit√©"
    )

    strategy_info = sample_strategies[selected_strategy]
    target_sample_size = strategy_info["size"]

    # Affichage des informations de la strat√©gie
    st.sidebar.info(f"""
    **{selected_strategy}**

    üìä √âchantillons: {target_sample_size:,}
    ‚è±Ô∏è Temps estim√©: {strategy_info['description']}
    üéØ Usage: {strategy_info['use_case']}
    """)

    # Upload de fichier
    st.sidebar.subheader("üìÅ Fichier de donn√©es")
    uploaded_file = st.sidebar.file_uploader(
        "Charger fichier CSV",
        type=["csv"],
        help="Fichier avec colonnes: comment_sentiment, post_title, self_text"
    )

    # Option de chemin local pour tr√®s gros fichiers
    st.sidebar.markdown("**Pour fichiers > 200MB:**")
    local_file_path = st.sidebar.text_input(
        "Chemin fichier local",
        placeholder="/path/to/large_file.csv",
        help="Contourner la limite Streamlit"
    )
    use_local_file = st.sidebar.button("üìÇ Utiliser fichier local")

    # Param√®tres d'entra√Ænement
    st.sidebar.subheader("üèãÔ∏è Param√®tres d'entra√Ænement")
    num_epochs = st.sidebar.slider("√âpoques", 1, 5, config.DEFAULT_PARAMS["num_epochs"])
    learning_rate = st.sidebar.select_slider(
        "Learning rate",
        options=[1e-5, 5e-5, 1e-4, 5e-4, 1e-3],
        value=config.DEFAULT_PARAMS["learning_rate"],
        format_func=lambda x: f"{x:.0e}"
    )
    batch_size = st.sidebar.selectbox("Batch size", [4, 8, 16], index=1)

    # Interface principale
    col1, col2 = st.columns([2, 1])

    with col1:
        st.header("üìä Tableau de bord")

        # D√©termination de la source de donn√©es
        data_source = None
        tmp_file_path = None

        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name
            data_source = tmp_file_path

        elif use_local_file and local_file_path.strip():
            if os.path.exists(local_file_path.strip()):
                data_source = local_file_path.strip()
            else:
                st.error(f"‚ùå Fichier non trouv√©: {local_file_path}")

        # Traitement des donn√©es
        if data_source:
            try:
                st.subheader("üéØ √âchantillonnage intelligent")

                # Traitement des donn√©es avec √©chantillonnage
                df = data_processor.load_and_sample_data(data_source, target_sample_size)

                if df is not None:
                    # Division des donn√©es
                    data_splits = data_processor.split_data(
                        df,
                        config.DEFAULT_PARAMS["test_size"],
                        config.DEFAULT_PARAMS["val_size"]
                    )

                    if data_splits:
                        # Statistiques d√©taill√©es
                        st.subheader("üìà Statistiques de l'√©chantillon")

                        # M√©triques principales
                        metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4)

                        with metrics_col1:
                            st.metric("üìä Total", f"{len(df):,}")
                        with metrics_col2:
                            st.metric("üèãÔ∏è Train", f"{len(data_splits['train']):,}")
                        with metrics_col3:
                            st.metric("‚úÖ Validation", f"{len(data_splits['validation']):,}")
                        with metrics_col4:
                            st.metric("üß™ Test", f"{len(data_splits['test']):,}")

                        # Visualisations
                        viz_col1, viz_col2 = st.columns(2)

                        with viz_col1:
                            # Distribution des sentiments
                            sentiment_counts = df['comment_sentiment'].value_counts()
                            fig_pie = px.pie(
                                values=sentiment_counts.values,
                                names=sentiment_counts.index,
                                title="Distribution des sentiments",
                                color_discrete_map={
                                    'negative': '#ff6b6b',
                                    'neutral': '#ffd93d',
                                    'positive': '#6bcf7f'
                                }
                            )
                            st.plotly_chart(fig_pie, use_container_width=True)

                        with viz_col2:
                            # Distribution par split
                            split_data = []
                            for split_name, split_df in data_splits.items():
                                for sentiment in ['negative', 'neutral', 'positive']:
                                    count = len(split_df[split_df['comment_sentiment'] == sentiment])
                                    split_data.append({
                                        'Split': split_name.capitalize(),
                                        'Sentiment': sentiment,
                                        'Count': count
                                    })

                            split_df_viz = pd.DataFrame(split_data)
                            fig_bar = px.bar(
                                split_df_viz,
                                x='Split',
                                y='Count',
                                color='Sentiment',
                                title="Distribution par split",
                                color_discrete_map={
                                    'negative': '#ff6b6b',
                                    'neutral': '#ffd93d',
                                    'positive': '#6bcf7f'
                                }
                            )
                            st.plotly_chart(fig_bar, use_container_width=True)

                        # Aper√ßu des donn√©es
                        st.subheader("üëÄ Aper√ßu des donn√©es")
                        sample_data = df.sample(n=min(5, len(df)), random_state=42)

                        for idx, row in sample_data.iterrows():
                            sentiment_color = {
                                'negative': 'üî¥',
                                'neutral': 'üü°',
                                'positive': 'üü¢'
                            }

                            with st.expander(f"{sentiment_color[row['comment_sentiment']]} {row['comment_sentiment'].upper()} - √âchantillon {idx}"):
                                st.write(f"**Texte:** {row['text'][:200]}...")

                        # Stockage dans session state
                        st.session_state["data_splits"] = data_splits
                        st.session_state["data_ready"] = True
                        st.session_state["sample_strategy"] = selected_strategy

                        # Informations sur la strat√©gie utilis√©e
                        st.success(f"‚úÖ {selected_strategy} appliqu√©e avec succ√®s!")

            finally:
                # Nettoyage du fichier temporaire
                if tmp_file_path and os.path.exists(tmp_file_path):
                    os.unlink(tmp_file_path)

        elif "data_ready" not in st.session_state:
            st.info("üëÜ Veuillez charger un fichier CSV pour commencer l'√©chantillonnage intelligent")

            # Guide d'utilisation
            st.markdown("""
            ### üéØ Guide d'utilisation Option A

            **1. Choisissez votre strat√©gie d'√©chantillonnage:**
            - üöÄ **Test rapide (30K)**: Pour valider rapidement votre pipeline
            - ‚úÖ **Validation (75K)**: Bon √©quilibre pour le d√©veloppement
            - üéØ **Production (150K)**: Mod√®le final de qualit√© production
            - üî• **Maximum (300K)**: Performance maximale si n√©cessaire

            **2. Chargez vos donn√©es:**
            - Fichiers < 200MB: Upload direct
            - Fichiers > 200MB: Chemin local

            **3. L'algorithme va:**
            - Analyser votre dataset
            - Cr√©er un √©chantillon √©quilibr√© et repr√©sentatif
            - Optimiser pour vos contraintes de temps
            """)

    with col2:
        st.header("üöÄ Actions")

        # Informations sur le GPU/CPU
        device_info = "üî• GPU" if config.DEVICE == "cuda" else "üíª CPU"
        st.info(f"**Dispositif:** {device_info}")

        if "data_ready" in st.session_state:
            strategy_used = st.session_state.get("sample_strategy", "Non d√©finie")
            st.success(f"**Strat√©gie:** {strategy_used}")

        # Bouton d'initialisation du mod√®le
        if st.button("ü§ñ Initialiser Mod√®le", use_container_width=True):
            if "data_ready" in st.session_state:
                lora_params = {
                    "lora_r": config.DEFAULT_PARAMS["lora_r"],
                    "lora_alpha": config.DEFAULT_PARAMS["lora_alpha"],
                    "lora_dropout": config.DEFAULT_PARAMS["lora_dropout"]
                }

                if model_manager.initialize_model(lora_params):
                    st.session_state["model_ready"] = True
            else:
                st.warning("‚ö†Ô∏è Chargez d'abord les donn√©es")

        # Bouton d'entra√Ænement
        if st.button("üèãÔ∏è Lancer Entra√Ænement", use_container_width=True):
            if "model_ready" in st.session_state and "data_ready" in st.session_state:

                # Estimation du temps d'entra√Ænement
                data_splits = st.session_state["data_splits"]
                train_size = len(data_splits["train"])
                estimated_time = (train_size * num_epochs * batch_size) // 1000  # Estimation approximative

                st.info(f"‚è±Ô∏è Temps estim√©: ~{estimated_time} minutes")

                with st.spinner("üèãÔ∏è Entra√Ænement en cours..."):
                    try:
                        # Pr√©paration des datasets
                        datasets = {}

                        for split_name, split_data in data_splits.items():
                            dataset = Dataset.from_pandas(split_data)
                            dataset = dataset.map(
                                lambda x: model_manager.preprocess_data(
                                    x,
                                    config.DEFAULT_PARAMS["max_input_length"],
                                    config.DEFAULT_PARAMS["max_target_length"]
                                ),
                                batched=True,
                                remove_columns=split_data.columns.tolist()
                            )
                            dataset.set_format("torch")
                            datasets[split_name] = dataset

                        # Configuration du trainer
                        training_params = {
                            "train_batch_size": batch_size,
                            "eval_batch_size": batch_size,
                            "num_epochs": num_epochs,
                            "learning_rate": learning_rate,
                            "eval_steps": config.DEFAULT_PARAMS["eval_steps"],
                            "logging_steps": config.DEFAULT_PARAMS["logging_steps"]
                        }

                        if model_manager.setup_trainer(datasets, training_params):
                            # Lancement de l'entra√Ænement
                            start_time = time.time()
                            model_manager.trainer.train()
                            training_time = time.time() - start_time

                            st.session_state["training_complete"] = True
                            st.session_state["training_time"] = training_time

                            st.success(f"‚úÖ Entra√Ænement termin√© en {training_time/60:.1f} minutes!")

                    except Exception as e:
                        st.error(f"‚ùå Erreur pendant l'entra√Ænement: {str(e)}")
                        logger.error(f"Erreur entra√Ænement: {e}")
            else:
                st.warning("‚ö†Ô∏è Initialisez d'abord le mod√®le")

        # Bouton d'√©valuation
        if st.button("üìä √âvaluer sur Test", use_container_width=True):
            if "training_complete" in st.session_state:
                with st.spinner("üìä √âvaluation en cours..."):
                    try:
                        data_splits = st.session_state["data_splits"]
                        test_dataset = Dataset.from_pandas(data_splits["test"])
                        test_dataset = test_dataset.map(
                            lambda x: model_manager.preprocess_data(
                                x,
                                config.DEFAULT_PARAMS["max_input_length"],
                                config.DEFAULT_PARAMS["max_target_length"]
                            ),
                            batched=True,
                            remove_columns=data_splits["test"].columns.tolist()
                        )
                        test_dataset.set_format("torch")

                        results = model_manager.trainer.evaluate(test_dataset)
                        st.session_state["test_results"] = results

                        # Affichage des r√©sultats avec contexte
                        st.subheader("üéØ R√©sultats finaux")

                        col1, col2, col3 = st.columns(3)
                        with col1:
                            acc = results.get('eval_accuracy', 0)
                            st.metric("üéØ Accuracy", f"{acc:.3f}", delta=f"{(acc-0.33)*100:+.1f}%" if acc > 0.33 else None)
                        with col2:
                            f1w = results.get('eval_f1_weighted', 0)
                            st.metric("üìä F1 Weighted", f"{f1w:.3f}")
                        with col3:
                            f1m = results.get('eval_f1_macro', 0)
                            st.metric("üìà F1 Macro", f"{f1m:.3f}")

                        # Interpr√©tation des r√©sultats
                        if acc > 0.75:
                            st.success("üéâ Excellents r√©sultats! Mod√®le pr√™t pour la production.")
                        elif acc > 0.65:
                            st.info("‚úÖ Bons r√©sultats. Consid√©rez la strat√©gie 'Maximum' pour am√©liorer.")
                        else:
                            st.warning("‚ö†Ô∏è R√©sultats moyens. Essayez avec plus de donn√©es ou ajustez les param√®tres.")

                    except Exception as e:
                        st.error(f"‚ùå Erreur pendant l'√©valuation: {str(e)}")
                        logger.error(f"Erreur √©valuation: {e}")
            else:
                st.warning("‚ö†Ô∏è Terminez d'abord l'entra√Ænement")

        # Informations de performance
        if "training_complete" in st.session_state:
            st.markdown("---")
            st.subheader("‚ö° Performance")

            training_time = st.session_state.get("training_time", 0)
            strategy_used = st.session_state.get("sample_strategy", "Non d√©finie")

            st.metric("‚è±Ô∏è Temps d'entra√Ænement", f"{training_time/60:.1f} min")
            st.info(f"**Strat√©gie utilis√©e:** {strategy_used}")

    # Onglets pour les visualisations avanc√©es
    if "training_complete" in st.session_state:
        st.markdown("---")
        tab1, tab2, tab3 = st.tabs(["üìà Courbes d'apprentissage", "üîç Test interactif", "üìã Rapport d√©taill√©"])

        with tab1:
            if model_manager.training_logs["train_loss"]:
                # Graphique des losses
                fig = go.Figure()

                fig.add_trace(go.Scatter(
                    x=model_manager.training_logs["steps"],
                    y=model_manager.training_logs["train_loss"],
                    mode='lines+markers',
                    name='Train Loss',
                    line=dict(color='blue', width=2)
                ))

                if model_manager.training_logs["val_loss"]:
                    val_steps = model_manager.training_logs["steps"][:len(model_manager.training_logs["val_loss"])]
                    fig.add_trace(go.Scatter(
                        x=val_steps,
                        y=model_manager.training_logs["val_loss"],
                        mode='lines+markers',
                        name='Validation Loss',
                        line=dict(color='red', width=2)
                    ))

                fig.update_layout(
                    title="√âvolution des losses pendant l'entra√Ænement",
                    xaxis_title="Steps",
                    yaxis_title="Loss",
                    hovermode='x unified',
                    template="plotly_white"
                )

                st.plotly_chart(fig, use_container_width=True)

                # Analyse de la convergence
                if len(model_manager.training_logs["train_loss"]) > 5:
                    last_losses = model_manager.training_logs["train_loss"][-5:]
                    loss_trend = (last_losses[-1] - last_losses[0]) / last_losses[0] * 100

                    if loss_trend < -1:
                        st.success(f"üìà Mod√®le en cours d'am√©lioration (-{abs(loss_trend):.1f}% sur les derniers steps)")
                    elif loss_trend > 1:
                        st.warning(f"üìâ Loss en augmentation (+{loss_trend:.1f}% - possible surentra√Ænement)")
                    else:
                        st.info("üìä Loss stabilis√©e - Convergence atteinte")
            else:
                st.info("Aucune donn√©e d'entra√Ænement disponible")

        with tab2:
            st.subheader("üîç Test de pr√©diction interactif")

            # Exemples pr√©d√©finis
            example_texts = {
                "N√©gatif": "Climate change is destroying our planet and governments are doing nothing about it!",
                "Neutre": "Scientists published a new study about climate change impacts on weather patterns.",
                "Positif": "Great progress on renewable energy! Solar panels are becoming more efficient and affordable."
            }

            col1, col2 = st.columns([2, 1])

            with col1:
                test_text = st.text_area(
                    "Entrez un texte √† classifier:",
                    value=example_texts["Neutre"],
                    height=100
                )

            with col2:
                st.write("**Exemples:**")
                for label, text in example_texts.items():
                    if st.button(f"üìù {label}", key=f"example_{label}"):
                        st.session_state["test_text"] = text
                        st.experimental_rerun()

            if st.session_state.get("test_text"):
                test_text = st.session_state["test_text"]

            if st.button("üéØ Pr√©dire", use_container_width=True) and test_text.strip():
                try:
                    # Pr√©paration de l'input
                    input_text = f"classify sentiment: {test_text}"
                    inputs = model_manager.tokenizer(
                        input_text,
                        return_tensors="pt",
                        max_length=config.DEFAULT_PARAMS["max_input_length"],
                        truncation=True,
                        padding=True
                    ).to(config.DEVICE)

                    # Pr√©diction avec probabilit√©s
                    with torch.no_grad():
                        outputs = model_manager.model.generate(
                            **inputs,
                            max_length=config.DEFAULT_PARAMS["max_target_length"],
                            num_beams=3,
                            early_stopping=True,
                            return_dict_in_generate=True,
                            output_scores=True
                        )

                    predicted_text = model_manager.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

                    # Affichage du r√©sultat avec style
                    sentiment_styles = {
                        "negative": {"color": "#ff6b6b", "icon": "üî¥", "bg": "#ffe6e6"},
                        "neutral": {"color": "#ffd93d", "icon": "üü°", "bg": "#fffacd"},
                        "positive": {"color": "#6bcf7f", "icon": "üü¢", "bg": "#e6ffe6"}
                    }

                    predicted_sentiment = predicted_text.lower()
                    if predicted_sentiment in sentiment_styles:
                        style = sentiment_styles[predicted_sentiment]

                        st.markdown(f"""
                        <div style="padding: 20px; border-radius: 10px; background-color: {style['bg']}; border-left: 5px solid {style['color']};">
                            <h3 style="color: {style['color']}; margin: 0;">
                                {style['icon']} Pr√©diction: {predicted_sentiment.upper()}
                            </h3>
                            <p style="margin: 10px 0 0 0; font-style: italic;">"{test_text}"</p>
                        </div>
                        """, unsafe_allow_html=True)
                    else:
                        st.success(f"**Pr√©diction:** {predicted_text.upper()}")

                except Exception as e:
                    st.error(f"‚ùå Erreur de pr√©diction: {str(e)}")

        with tab3:
            st.subheader("üìã Rapport d√©taill√© de l'entra√Ænement")

            # R√©sum√© de la configuration
            config_summary = {
                "Strat√©gie d'√©chantillonnage": st.session_state.get("sample_strategy", "Non d√©finie"),
                "Taille de l'√©chantillon": f"{len(st.session_state.get('data_splits', {}).get('train', [])):,} (train)",
                "√âpoques": num_epochs,
                "Learning rate": f"{learning_rate:.0e}",
                "Batch size": batch_size,
                "Dispositif": config.DEVICE.upper(),
                "Temps d'entra√Ænement": f"{st.session_state.get('training_time', 0)/60:.1f} min"
            }

            col1, col2 = st.columns(2)

            with col1:
                st.markdown("**Configuration:**")
                for key, value in config_summary.items():
                    st.write(f"‚Ä¢ **{key}:** {value}")

            with col2:
                if "test_results" in st.session_state:
                    results = st.session_state["test_results"]
                    st.markdown("**M√©triques finales:**")
                    st.write(f"‚Ä¢ **Accuracy:** {results.get('eval_accuracy', 0):.3f}")
                    st.write(f"‚Ä¢ **F1 Weighted:** {results.get('eval_f1_weighted', 0):.3f}")
                    st.write(f"‚Ä¢ **F1 Macro:** {results.get('eval_f1_macro', 0):.3f}")
                    st.write(f"‚Ä¢ **Loss finale:** {results.get('eval_loss', 0):.3f}")

            # Recommandations
            st.markdown("---")
            st.markdown("**üéØ Recommandations pour am√©liorer les performances:**")

            if "test_results" in st.session_state:
                acc = st.session_state["test_results"].get('eval_accuracy', 0)

                if acc < 0.65:
                    st.markdown("""
                    - üìà **Augmenter la taille de l'√©chantillon** (strat√©gie Maximum)
                    - üîß **Ajuster le learning rate** (essayer 1e-4 ou 1e-3)
                    - üìö **Augmenter le nombre d'√©poques** (5-7 √©poques)
                    - üéØ **V√©rifier la qualit√© des donn√©es** (textes trop courts, labels incorrects)
                    """)
                elif acc < 0.75:
                    st.markdown("""
                    - ‚úÖ **Bon mod√®le!** Consid√©rez la strat√©gie Maximum pour gagner quelques points
                    - üîß **Fine-tuning des hyperparam√®tres** (LoRA rank, alpha)
                    - üìä **Analyse des erreurs** sur les pr√©dictions incorrectes
                    """)
                else:
                    st.markdown("""
                    - üéâ **Excellent mod√®le!** Pr√™t pour la production
                    - üíæ **Sauvegarder le mod√®le** pour utilisation future
                    - üöÄ **D√©ploiement recommand√©**
                    """)

# Script entry point: build the Streamlit UI only when this file is run as the
# main module (the notebook launches it with `streamlit run climate_app.py`),
# not when it is merely imported.
if __name__ == "__main__":
    create_streamlit_app()

Overwriting climate_app.py


In [10]:
# Launch the Streamlit app as a background shell job (note the trailing `&`),
# serving on port 8501 and bound to all interfaces (0.0.0.0).
# NOTE(review): the later launcher cell pkills every "streamlit" process and
# starts the app again, so this cell looks redundant — confirm before removing.
get_ipython().system_raw('streamlit run climate_app.py --server.port=8501 --server.address=0.0.0.0 &')

In [11]:
pip install pyngrok



In [12]:
import os, time, subprocess, socket
from pyngrok import ngrok

# 1Ô∏è‚É£ Nettoie tout
# Force-kill (SIGKILL) every process whose command line matches one of these
# names; pkill's stderr is discarded.
for stale_process in ("streamlit", "ngrok"):
    subprocess.run(["pkill", "-9", "-f", stale_process], stderr=subprocess.DEVNULL)
# Also ask pyngrok to kill its ngrok process, then pause for two seconds
# before relaunching.
ngrok.kill()
time.sleep(2)

# Step 2: start Streamlit in the background; its stdout and stderr are discarded.
streamlit_command = [
    "streamlit",
    "run",
    "climate_app.py",
    "--server.port=8501",
    "--server.address=0.0.0.0",
]
subprocess.Popen(
    streamlit_command,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# 3Ô∏è‚É£ Attend que le port 8501 soit vraiment √©cout√©
def wait_for_port(port=8501, timeout=10, host="localhost", interval=1.0):
    """Poll until a TCP server accepts connections on ``host:port``.

    Args:
        port: TCP port to probe.
        timeout: Total time budget in seconds (int or float). A value <= 0
            returns False without probing.
        host: Host name or IPv4 address to probe.
        interval: Pause between two probes, in seconds; also the upper bound
            for a single connection attempt.

    Returns:
        True as soon as one connection attempt succeeds, False once the time
        budget is exhausted.
    """
    # A monotonic deadline makes `timeout` a real duration rather than an
    # attempt count, and is immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            # Bound each attempt so a hanging connect cannot overshoot the
            # budget. Never pass 0: that would switch the socket to
            # non-blocking mode and change what connect_ex returns.
            probe.settimeout(max(min(interval, remaining), 0.01))
            if probe.connect_ex((host, port)) == 0:
                return True
        # Sleep only for what is left of the budget (no pause after the
        # final failed attempt).
        pause = min(interval, deadline - time.monotonic())
        if pause > 0:
            time.sleep(pause)

if wait_for_port():
    # Step 4: open the public ngrok tunnel to the local Streamlit port.
    # The auth token is read from the NGROK_AUTHTOKEN environment variable
    # rather than being hardcoded: a token written in a notebook is leaked to
    # everyone the notebook is shared with.
    # NOTE(review): the token that used to be hardcoded here must be revoked
    # in the ngrok dashboard, since it has already been exposed.
    ngrok_token = os.environ.get("NGROK_AUTHTOKEN", "").strip()
    if ngrok_token:
        ngrok.set_auth_token(ngrok_token)
        public_url = ngrok.connect(8501)
        print("üîó Acc√®s public :", public_url)
    else:
        print("Variable d'environnement NGROK_AUTHTOKEN absente : tunnel ngrok non ouvert")
else:
    print("‚ùå Streamlit n‚Äôa pas d√©marr√© sur le port 8501")

üîó Acc√®s public : NgrokTunnel: "https://8c4f2b343f54.ngrok-free.app" -> "http://localhost:8501"
