<a href="https://colab.research.google.com/github/Karthikreddy1010/Automated-Scientific-Data-Paper-Linkage-with-Contextual-Summarization/blob/main/Topic_modelling_databasesystems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install bertopic
!pip install umap-learn
!pip install hdbscan
!pip install sentence-transformers
!pip install plotly
!pip install wordcloud
!pip install gensim
!pip install keybert
!pip install tqdm
!pip install matplotlib
!pip install seaborn

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m886.1 kB/s[0m eta [36m

In [5]:
# =============================================================================
# IMPORTS - RUN AFTER RUNTIME RESTART
# =============================================================================

import logging
import pandas as pd
import numpy as np
import os
from datetime import datetime
import sys
import torch
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import pickle
from tqdm import tqdm
import re
import json
from collections import Counter
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

print("All imports completed successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"PyTorch version: {torch.__version__}")

# =============================================================================
# CONFIGURATION - OPTIMIZED PARAMETERS WITH ENHANCEMENTS
# =============================================================================

class ColabConfig:
    # Paths
    PROCESSED_TEXT_CSV = "processed_combined_texts.csv"
    TRAIN_LABELS_CSV = "train_labels.csv"
    OUTPUT_DIR = "phase1_output_optimized_v3"

    # Enhanced Model Parameters
    EMBEDDING_MODELS = [
        "sentence-transformers/all-mpnet-base-v2",  # Better general purpose
        "sentence-transformers/all-MiniLM-L12-v2",  # Faster alternative
    ]
    CURRENT_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

    # OPTIMIZED Topic Modeling Parameters
    UMAP_CONFIGS = [
        {
            'n_neighbors': 15,
            'n_components': 10,
            'min_dist': 0.1,
            'metric': 'cosine',
            'random_state': 42
        },
        {
            'n_neighbors': 25,
            'n_components': 8,
            'min_dist': 0.0,
            'metric': 'euclidean',
            'random_state': 42
        }
    ]
    CURRENT_UMAP_CONFIG = UMAP_CONFIGS[0]

    HDBSCAN_CONFIGS = [
        {
            'min_cluster_size': 20,
            'min_samples': 3,
            'cluster_selection_epsilon': 0.1,
            'metric': 'euclidean',
            'cluster_selection_method': 'eom'
        },
        {
            'min_cluster_size': 15,
            'min_samples': 2,
            'metric': 'cosine',
            'cluster_selection_method': 'leaf'
        }
    ]
    CURRENT_HDBSCAN_CONFIG = HDBSCAN_CONFIGS[0]

    # Evaluation Parameters
    TOPIC_COHERENCE_METRICS = ['c_v', 'u_mass', 'c_uci', 'c_npmi']
    TOP_N_WORDS_FOR_COHERENCE = 10
    SILHOUETTE_SAMPLE_SIZE = 1000

    # Enhanced Domain Mapping
    DOMAIN_LEXICONS = {
        'computer_science': [
            'algorithm', 'software', 'programming', 'machine learning', 'artificial intelligence',
            'neural network', 'deep learning', 'computer vision', 'natural language processing',
            'data structure', 'optimization', 'computational', 'algorithmic', 'software engineering',
            'segmentation', 'data mining', 'reinforcement learning', 'big data', 'database',
            'information retrieval', 'cybersecurity', 'cloud computing', 'distributed system',
            'computer graphics', 'human-computer interaction', 'parallel computing',
            'data analytics', 'pattern recognition', 'knowledge graph', 'semantic web',
            'recommendation system', 'blockchain', 'bioinformatics', 'quantum computing',
            'speech recognition', 'autonomous systems', 'image processing', 'object detection'
        ],

        'biology': [
            'cell', 'gene', 'organism', 'evolution', 'molecular', 'genetic', 'protein',
            'dna', 'rna', 'biological', 'ecosystem', 'species', 'genome', 'phylogenetic',
            'enzyme', 'metabolism', 'mutation', 'microorganism', 'bacterium', 'virus',
            'genomics', 'proteomics', 'transcription', 'translation', 'biochemistry',
            'cellular', 'biosynthesis', 'morphology', 'taxonomy', 'immunology', 'cytoplasm',
            'ecology', 'biome', 'photosynthesis', 'reproduction', 'adaptation', 'microbiome'
        ],

        'medicine': [
            'patient', 'clinical', 'treatment', 'disease', 'medical', 'health', 'therapy',
            'diagnosis', 'hospital', 'pharmaceutical', 'symptom', 'epidemiology', 'clinical trial',
            'surgery', 'vaccine', 'pathology', 'oncology', 'radiology', 'cardiology',
            'neurology', 'immunotherapy', 'infection', 'public health', 'biomedical',
            'mental health', 'nursing', 'anatomy', 'pharmacology', 'genetic disorder',
            'pandemic', 'epidemic', 'diagnostic imaging', 'rehabilitation', 'virology',
            'hematology', 'toxicology', 'anesthesia', 'therapeutic'
        ],

        'physics': [
            'quantum', 'particle', 'energy', 'field', 'mechanics', 'astrophysics', 'relativity',
            'thermodynamics', 'electromagnetic', 'nuclear', 'condensed matter', 'cosmology',
            'optics', 'wave', 'photon', 'entropy', 'string theory', 'plasma', 'momentum',
            'gravity', 'magnetism', 'superconductivity', 'radiation', 'force', 'quantization',
            'spin', 'atomic', 'nucleus', 'vacuum', 'diffraction', 'thermodynamic', 'wavefunction'
        ],

        'chemistry': [
            'molecule', 'reaction', 'compound', 'synthesis', 'chemical', 'organic', 'inorganic',
            'catalyst', 'polymer', 'nanomaterial', 'spectroscopy', 'chromatography',
            'stoichiometry', 'acid', 'base', 'solvent', 'oxidation', 'reduction', 'ionic',
            'covalent', 'electrochemistry', 'crystallography', 'thermochemistry', 'kinetics',
            'mass spectrometry', 'nmr', 'infrared', 'quantum chemistry', 'biochemical', 'adsorption'
        ],

        'engineering': [
            'design', 'system', 'manufacturing', 'structural', 'electrical', 'mechanical',
            'civil', 'aerospace', 'robotics', 'automation', 'control system', 'material science',
            'signal processing', 'circuit', 'mechatronics', 'thermal', 'hydraulics', 'simulation',
            'finite element analysis', 'CAD', 'CAM', 'sensor', 'actuator', 'embedded system',
            'renewable energy', 'nanotechnology', 'industrial engineering', 'automotive',
            'pipeline', 'instrumentation', 'metallurgy', 'power system', 'fault detection'
        ],

        'environmental_science': [
            'climate', 'conservation', 'sustainability', 'environmental', 'pollution',
            'ecosystem', 'biodiversity', 'renewable energy', 'carbon', 'deforestation',
            'greenhouse gas', 'recycling', 'ecology', 'habitat', 'ozone', 'soil erosion',
            'waste management', 'water quality', 'air quality', 'toxicology', 'environmental impact',
            'hydrology', 'meteorology', 'carbon footprint', 'energy efficiency', 'wildlife',
            'forest management', 'sustainable development', 'climate modeling', 'emissions'
        ]
    }

    # Visualization Settings
    PLOT_WIDTH = 1200
    PLOT_HEIGHT = 800

    def __init__(self):
        # Create output directory
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

# Initialize config
config = ColabConfig()

# =============================================================================
# ENHANCED TEXT PREPROCESSING
# =============================================================================

class EnhancedTextPreprocessor:
    def __init__(self):
        # Scientific paper specific stop words
        self.scientific_stop_words = set([
            'paper', 'study', 'research', 'result', 'method', 'approach',
            'propose', 'show', 'demonstrate', 'present', 'investigate',
            'analyze', 'discuss', 'conclude', 'suggest', 'indicate',
            'figure', 'table', 'section', 'equation', 'reference'
        ])

        # Common patterns in scientific text to remove
        self.patterns = [
            r'\b(doi|https?://|www\.)\S+',  # URLs and DOIs
            r'\b(fig|figure|table)\s+\d+',  # Figure/table references
            r'\b(et al|etc|e\.g|i\.e)\.',   # Common abbreviations
            r'\b\d+\b',                      # Standalone numbers
            r'\b[a-zA-Z]\b',                 # Single letters
        ]

    def clean_scientific_text(self, text):
        """Enhanced cleaning for scientific text"""
        if not isinstance(text, str):
            return ""

        text = text.lower().strip()

        # Remove patterns
        for pattern in self.patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

        # Remove scientific stop words
        words = [
            word for word in text.split()
            if word not in self.scientific_stop_words and len(word) > 2
        ]

        return ' '.join(words)

# =============================================================================
# ENHANCED DATA LOADER WITH PREPROCESSING
# =============================================================================

class EnhancedDataLoader:
    def __init__(self):
        self.processed_texts_df = None
        self.train_labels_df = None
        self.preprocessor = EnhancedTextPreprocessor()

    def load_data(self):
        """Load and merge both CSV files with enhanced preprocessing"""
        try:
            # Load processed texts
            logging.info(f"Loading processed texts from {config.PROCESSED_TEXT_CSV}")
            self.processed_texts_df = pd.read_csv(config.PROCESSED_TEXT_CSV)

            # Load training labels
            logging.info(f"Loading training labels from {config.TRAIN_LABELS_CSV}")
            self.train_labels_df = pd.read_csv(config.TRAIN_LABELS_CSV)

            # Enhanced data validation and preprocessing
            self._validate_and_preprocess_data()

            # Merge datasets if needed
            merged_df = self._merge_datasets()

            logging.info(f"Loaded {len(self.processed_texts_df)} processed texts")
            logging.info(f"Loaded {len(self.train_labels_df)} training labels")

            return merged_df

        except Exception as e:
            logging.error(f"Error loading data: {e}")
            raise

    def _validate_and_preprocess_data(self):
        """Enhanced data validation with preprocessing"""
        required_columns = ['filename', 'processed_text']
        for col in required_columns:
            if col not in self.processed_texts_df.columns:
                raise ValueError(f"Missing required column: {col} in processed texts")

        # Apply enhanced text preprocessing
        initial_count = len(self.processed_texts_df)

        # Clean texts
        self.processed_texts_df['cleaned_text'] = self.processed_texts_df['processed_text'].apply(
            lambda x: self.preprocessor.clean_scientific_text(x) if isinstance(x, str) else ""
        )

        # Filter out documents with very short text after cleaning
        self.processed_texts_df = self.processed_texts_df[
            self.processed_texts_df['cleaned_text'].str.len() > 100
        ].copy()

        filtered_count = initial_count - len(self.processed_texts_df)
        if filtered_count > 0:
            logging.info(f"Filtered out {filtered_count} documents with short text after cleaning")

        # Text statistics after cleaning
        text_lengths = self.processed_texts_df['cleaned_text'].str.len()
        word_counts = self.processed_texts_df['cleaned_text'].str.split().str.len()

        logging.info(f"After cleaning - Text length: min={text_lengths.min()}, max={text_lengths.max()}, mean={text_lengths.mean():.1f}")
        logging.info(f"After cleaning - Word count: mean={word_counts.mean():.1f}")

    def _merge_datasets(self):
        """Merge processed texts with training labels for comprehensive analysis"""
        merged_df = self.processed_texts_df.copy()

        # Add enhanced statistics
        merged_df['text_length'] = merged_df['cleaned_text'].str.len()
        merged_df['word_count'] = merged_df['cleaned_text'].str.split().str.len()
        merged_df['avg_word_length'] = merged_df['cleaned_text'].apply(
            lambda x: np.mean([len(word) for word in x.split()]) if x else 0
        )

        return merged_df

    def get_documents_for_topic_modeling(self):
        """Get cleaned texts ready for topic modeling"""
        if self.processed_texts_df is None:
            self.load_data()

        documents = self.processed_texts_df['cleaned_text'].tolist()
        filenames = self.processed_texts_df['filename'].tolist()

        # Remove any None or empty documents
        valid_docs = []
        valid_filenames = []

        for doc, filename in zip(documents, filenames):
            if doc and isinstance(doc, str) and len(doc.strip()) > 100:
                valid_docs.append(doc.strip())
                valid_filenames.append(filename)

        logging.info(f"Prepared {len(valid_docs)} valid cleaned documents for topic modeling")

        return valid_docs, valid_filenames

# =============================================================================
# ENHANCED EMBEDDINGS GENERATOR
# =============================================================================

class EnhancedEmbeddingsGenerator:
    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logging.info(f"Using device: {self.device}")

    def load_model(self, model_name=config.CURRENT_EMBEDDING_MODEL):
        """Load embedding model with fallback options"""
        try:
            logging.info(f"Loading embedding model: {model_name}...")

            if "sentence-transformers" in model_name:
                self.model = SentenceTransformer(model_name)
                self.model.to(self.device)
            else:
                # For SPECTER models
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(model_name)
                self.model.to(self.device)
                self.model.eval()

            logging.info(f"Embedding model {model_name} loaded successfully")

        except Exception as e:
            logging.warning(f"Failed to load {model_name}: {e}")
            # Fallback to default model
            if model_name != config.EMBEDDING_MODELS[0]:
                logging.info(f"Falling back to {config.EMBEDDING_MODELS[0]}")
                return self.load_model(config.EMBEDDING_MODELS[0])
            else:
                raise

    def generate_embeddings(self, documents, batch_size=16, normalize_embeddings=True):
        """Generate enhanced embeddings with normalization"""
        if self.model is None:
            self.load_model()

        logging.info(f"Generating embeddings for {len(documents)} documents...")

        all_embeddings = []

        # Check if using sentence-transformers or transformers
        if isinstance(self.model, SentenceTransformer):
            # Use sentence-transformers inference
            all_embeddings = self.model.encode(
                documents,
                batch_size=batch_size,
                show_progress_bar=True,
                convert_to_tensor=False,
                normalize_embeddings=normalize_embeddings
            )
        else:
            # Use transformers inference (for SPECTER)
            with torch.no_grad():
                for i in tqdm(range(0, len(documents), batch_size), desc="Generating embeddings"):
                    batch_docs = documents[i:i + batch_size]

                    # Tokenize batch
                    inputs = self.tokenizer(
                        batch_docs,
                        padding=True,
                        truncation=True,
                        max_length=512,
                        return_tensors="pt"
                    ).to(self.device)

                    # Generate embeddings
                    outputs = self.model(**inputs)
                    embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
                    embeddings = embeddings.cpu().numpy()

                    if normalize_embeddings:
                        embeddings = normalize(embeddings, norm='l2')

                    all_embeddings.append(embeddings)

            # Concatenate all embeddings
            all_embeddings = np.vstack(all_embeddings)

        logging.info(f"Generated embeddings shape: {all_embeddings.shape}")
        return all_embeddings

    def save_embeddings(self, embeddings, filenames, output_path):
        """Save embeddings to file"""
        np.save(output_path, embeddings)

        # Save filename mapping
        with open(output_path.replace('.npy', '_filenames.txt'), 'w') as f:
            for filename in filenames:
                f.write(f"{filename}\n")

        # Save embedding metadata
        metadata = {
            'model': config.CURRENT_EMBEDDING_MODEL,
            'shape': embeddings.shape,
            'normalized': True,
            'timestamp': datetime.now().isoformat()
        }

        with open(output_path.replace('.npy', '_metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2)

        logging.info(f"Embeddings saved to {output_path}")

    def load_embeddings(self, embeddings_path):
        """Load saved embeddings with metadata"""
        embeddings = np.load(embeddings_path)

        # Load filename mapping
        filenames_path = embeddings_path.replace('.npy', '_filenames.txt')
        with open(filenames_path, 'r') as f:
            filenames = [line.strip() for line in f]

        # Load metadata
        metadata_path = embeddings_path.replace('.npy', '_metadata.json')
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
            logging.info(f"Loaded embeddings metadata: {metadata}")

        logging.info(f"Loaded embeddings shape: {embeddings.shape}")
        logging.info(f"Loaded {len(filenames)} filenames")

        return embeddings, filenames

# =============================================================================
# ROBUST TOPIC MODELER WITH ADAPTIVE VECTORIZER
# =============================================================================

class RobustTopicModeler:
    def __init__(self):
        self.topic_model = None
        self.embeddings = None
        self.filenames = None
        self.documents = None
        self.topics = None
        self.probabilities = None
        self.evaluation_results = {}

    def create_adaptive_vectorizer(self, n_documents):
        """Create vectorizer with parameters adapted to dataset size"""
        # Adaptive parameters based on dataset size
        if n_documents < 100:
            # Small dataset - use lenient parameters
            min_df = 1
            max_df = 1.0
            max_features = 1000
        elif n_documents < 1000:
            # Medium dataset
            min_df = 2
            max_df = 0.95
            max_features = 2000
        else:
            # Large dataset
            min_df = 2
            max_df = 0.9
            max_features = 5000

        vectorizer_model = CountVectorizer(
            stop_words="english",
            ngram_range=(1, 1),  # Start with unigrams only for stability
            min_df=min_df,
            max_df=max_df,
            max_features=max_features,
            lowercase=True
        )

        logging.info(f"Created adaptive vectorizer: min_df={min_df}, max_df={max_df}, max_features={max_features}")
        return vectorizer_model

    def initialize_topic_model(self, n_documents):
        """Initialize BERTopic with robust parameters"""
        try:
            logging.info("Initializing ROBUST BERTopic model...")

            # Robust UMAP
            umap_model = UMAP(
                n_neighbors=15,
                n_components=10,
                min_dist=0.1,
                metric='cosine',
                random_state=42
            )

            # Robust HDBSCAN
            hdbscan_model = HDBSCAN(
                min_cluster_size=20,
                min_samples=3,
                metric='euclidean',
                cluster_selection_method='eom',
                prediction_data=True
            )

            # Adaptive vectorizer based on dataset size
            vectorizer_model = self.create_adaptive_vectorizer(n_documents)

            # Initialize BERTopic
            self.topic_model = BERTopic(
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                vectorizer_model=vectorizer_model,
                top_n_words=12,
                language='english',
                calculate_probabilities=True,
                verbose=True
            )

            logging.info("ROBUST BERTopic model initialized successfully")

        except Exception as e:
            logging.error(f"Error initializing topic model: {e}")
            raise

    def fit_model(self, documents, embeddings, filenames):
        """Fit topic model to documents"""
        logging.info(f"Fitting topic model to {len(documents)} documents...")

        # Store references
        self.embeddings = embeddings
        self.filenames = filenames
        self.documents = documents

        # Initialize model with dataset size
        self.initialize_topic_model(len(documents))

        # Fit the model with error handling
        try:
            self.topics, self.probabilities = self.topic_model.fit_transform(
                documents, embeddings
            )
        except Exception as e:
            logging.warning(f"Standard vectorizer failed: {e}")
            # Fallback to simpler vectorizer
            logging.info("Trying fallback vectorizer...")
            self._try_fallback_vectorizer(documents, embeddings)

        # Reduce outliers
        self.topics = self.reduce_outliers_simple()

        unique_topics = len(set(self.topics)) - (1 if -1 in self.topics else 0)
        logging.info(f"Topic modeling completed. Found {unique_topics} topics")

        # Evaluate the model
        self.evaluate_model(documents, embeddings)

        return self.topics, self.probabilities

    def _try_fallback_vectorizer(self, documents, embeddings):
        """Try fallback vectorizer configurations"""
        fallback_configs = [
            # Config 1: Very simple
            {
                'stop_words': 'english',
                'min_df': 1,
                'max_df': 1.0,
                'max_features': 1000
            },
            # Config 2: Minimal filtering
            {
                'stop_words': 'english',
                'min_df': 1,
                'max_df': 0.99,
                'max_features': 2000
            },
            # Config 3: Default sklearn
            {
                'stop_words': 'english'
            }
        ]

        for i, config in enumerate(fallback_configs):
            try:
                logging.info(f"Trying fallback vectorizer config {i+1}...")
                vectorizer = CountVectorizer(**config)

                # Reinitialize model with fallback vectorizer
                umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.1, metric='cosine', random_state=42)
                hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=3, metric='euclidean', cluster_selection_method='eom')

                self.topic_model = BERTopic(
                    umap_model=umap_model,
                    hdbscan_model=hdbscan_model,
                    vectorizer_model=vectorizer,
                    top_n_words=12,
                    language='english',
                    calculate_probabilities=True,
                    verbose=False
                )

                self.topics, self.probabilities = self.topic_model.fit_transform(documents, embeddings)
                logging.info(f"Fallback vectorizer config {i+1} succeeded!")
                return

            except Exception as e:
                logging.warning(f"Fallback config {i+1} failed: {e}")
                continue

        # If all fallbacks fail, use BERTopic without custom vectorizer
        logging.info("All vectorizer configurations failed. Using default BERTopic...")
        self.topic_model = BERTopic(
            n_gram_range=(1, 1),
            top_n_words=12,
            calculate_probabilities=True,
            verbose=True
        )
        self.topics, self.probabilities = self.topic_model.fit_transform(documents, embeddings)

    def reduce_outliers_simple(self):
        """Simple outlier reduction"""
        if self.topics is None or self.probabilities is None:
            return self.topics

        new_topics = self.topics.copy()

        # Reassign outliers with reasonable probability
        for i, (topic, prob) in enumerate(zip(new_topics, self.probabilities)):
            if topic == -1 and prob is not None:
                max_prob = np.max(prob)
                if max_prob > 0.15:  # Reasonable threshold
                    new_topic = np.argmax(prob)
                    new_topics[i] = new_topic

        original_outliers = np.sum(self.topics == -1)
        new_outliers = np.sum(new_topics == -1)

        if original_outliers > new_outliers:
            logging.info(f"Reduced outliers from {original_outliers} to {new_outliers}")

        return new_topics

    def evaluate_model(self, documents, embeddings):
        """Basic evaluation of topic model quality"""
        logging.info("Evaluating topic model quality...")

        evaluation_results = {}

        try:
            # Basic Statistics
            evaluation_results['basic_stats'] = self._compute_basic_statistics()

            # Cluster Quality Metrics
            evaluation_results['cluster_metrics'] = self._compute_cluster_metrics(embeddings)

            # Topic Coherence
            evaluation_results['coherence_metrics'] = self._compute_topic_coherence(documents)

            self.evaluation_results = evaluation_results
            self._print_evaluation_summary()

        except Exception as e:
            logging.warning(f"Some evaluation metrics failed: {e}")

    def _compute_basic_statistics(self):
        """Compute basic topic statistics"""
        stats = {}

        # Topic counts
        unique_topics = set(self.topics)
        stats['n_topics'] = len(unique_topics) - (1 if -1 in unique_topics else 0)
        stats['n_outliers'] = np.sum(np.array(self.topics) == -1)
        stats['outlier_percentage'] = (stats['n_outliers'] / len(self.topics)) * 100

        # Document distribution
        topic_counts = Counter(self.topics)
        if -1 in topic_counts:
            del topic_counts[-1]

        if topic_counts:
            counts = list(topic_counts.values())
            stats['avg_docs_per_topic'] = np.mean(counts)
            stats['std_docs_per_topic'] = np.std(counts)
            stats['min_docs_per_topic'] = np.min(counts)
            stats['max_docs_per_topic'] = np.max(counts)

        return stats

    def _compute_cluster_metrics(self, embeddings):
        """Compute cluster quality metrics"""
        metrics = {}

        try:
            # Filter out outliers for clustering metrics
            valid_indices = [i for i, topic in enumerate(self.topics) if topic != -1]

            if len(valid_indices) > 1:
                valid_embeddings = embeddings[valid_indices]
                valid_topics = [self.topics[i] for i in valid_indices]

                # Sample for large datasets
                if len(valid_embeddings) > config.SILHOUETTE_SAMPLE_SIZE:
                    sample_indices = np.random.choice(
                        len(valid_embeddings), config.SILHOUETTE_SAMPLE_SIZE, replace=False
                    )
                    sample_embeddings = valid_embeddings[sample_indices]
                    sample_topics = [valid_topics[i] for i in sample_indices]
                else:
                    sample_embeddings = valid_embeddings
                    sample_topics = valid_topics

                # Silhouette Score
                if len(set(sample_topics)) > 1:
                    metrics['silhouette_score'] = silhouette_score(
                        sample_embeddings, sample_topics, metric='cosine'
                    )
                else:
                    metrics['silhouette_score'] = 0.0

                # Calinski-Harabasz Index
                metrics['calinski_harabasz_score'] = calinski_harabasz_score(
                    valid_embeddings, valid_topics
                )

                # Davies-Bouldin Index
                metrics['davies_bouldin_score'] = davies_bouldin_score(
                    valid_embeddings, valid_topics
                )

        except Exception as e:
            logging.warning(f"Cluster metrics computation failed: {e}")
            metrics['silhouette_score'] = 0.0
            metrics['calinski_harabasz_score'] = 0.0
            metrics['davies_bouldin_score'] = float('inf')

        return metrics

    def _compute_topic_coherence(self, documents):
        """Compute topic coherence metrics"""
        coherence_metrics = {}

        try:
            # Prepare documents for coherence calculation
            tokenized_docs = [doc.split() for doc in documents]
            dictionary = corpora.Dictionary(tokenized_docs)
            dictionary.filter_extremes(no_below=2, no_above=0.8)
            corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

            # Get topic words
            topic_words = []
            for topic_id in range(len(self.topic_model.get_topic_info()) - 1):
                topic = self.topic_model.get_topic(topic_id)
                if topic:
                    words = [word for word, _ in topic[:config.TOP_N_WORDS_FOR_COHERENCE]]
                    topic_words.append(words)

            if topic_words and len(topic_words) > 1:
                # Compute c_v coherence (most reliable)
                try:
                    coherence_model = CoherenceModel(
                        topics=topic_words,
                        texts=tokenized_docs,
                        corpus=corpus,
                        dictionary=dictionary,
                        coherence='c_v',
                        topn=config.TOP_N_WORDS_FOR_COHERENCE
                    )
                    coherence_score = coherence_model.get_coherence()
                    coherence_metrics['c_v'] = coherence_score
                except Exception as e:
                    logging.warning(f"Coherence metric c_v failed: {e}")
                    coherence_metrics['c_v'] = 0.0

        except Exception as e:
            logging.warning(f"Topic coherence computation failed: {e}")

        return coherence_metrics

    def _print_evaluation_summary(self):
        """Print evaluation summary"""
        logging.info("TOPIC MODEL EVALUATION SUMMARY:")
        logging.info("=" * 50)

        # Basic Stats
        basic = self.evaluation_results['basic_stats']
        logging.info(f"Topics: {basic['n_topics']}, Outliers: {basic['n_outliers']} ({basic['outlier_percentage']:.1f}%)")
        logging.info(f"Documents per topic: {basic['avg_docs_per_topic']:.1f} ± {basic['std_docs_per_topic']:.1f}")

        # Cluster Metrics
        cluster = self.evaluation_results['cluster_metrics']
        logging.info(f"Silhouette Score: {cluster.get('silhouette_score', 0):.3f}")
        logging.info(f"Calinski-Harabasz: {cluster.get('calinski_harabasz_score', 0):.1f}")
        logging.info(f"Davies-Bouldin: {cluster.get('davies_bouldin_score', 0):.3f}")

        # Coherence Metrics
        coherence = self.evaluation_results['coherence_metrics']
        for metric, score in coherence.items():
            logging.info(f"Coherence ({metric}): {score:.3f}")

        logging.info("=" * 50)

    def get_topic_info(self):
        """Get detailed topic information"""
        if self.topic_model is None:
            raise ValueError("Model not fitted yet")

        topic_info = self.topic_model.get_topic_info()
        topic_info['document_count'] = topic_info['Topic'].apply(
            lambda x: np.sum(self.topics == x) if x != -1 else np.sum(self.topics == -1)
        )

        logging.info("Topic information generated")
        return topic_info

    def get_document_topic_mapping(self):
        """Create mapping of documents to their topics"""
        document_topic_map = []

        for filename, topic, prob in zip(self.filenames, self.topics, self.probabilities):
            if topic != -1 and prob is not None:
                topic_prob = prob[topic]
            else:
                topic_prob = 0.0

            document_topic_map.append({
                'filename': filename,
                'topic_id': topic,
                'topic_probability': topic_prob,
                'is_outlier': (topic == -1)
            })

        return document_topic_map

    def save_model(self, output_path):
        """Save the trained topic model"""
        if self.topic_model is None:
            raise ValueError("No model to save")

        # Save BERTopic model
        self.topic_model.save(output_path)

        # Save additional data
        additional_data = {
            'embeddings': self.embeddings,
            'filenames': self.filenames,
            'topics': self.topics,
            'probabilities': self.probabilities,
            'evaluation_results': self.evaluation_results
        }

        with open(output_path + '_additional.pkl', 'wb') as f:
            pickle.dump(additional_data, f)

        logging.info(f"Topic model saved to {output_path}")

# =============================================================================
# SIMPLIFIED DOMAIN MAPPER
# =============================================================================

class SimpleDomainMapper:
    def __init__(self):
        self.domain_lexicons = config.DOMAIN_LEXICONS
        self.topic_domain_map = {}

    def map_topics_to_domains(self, topic_model, topic_info):
        """Map topics to scientific domains"""
        logging.info("Mapping topics to scientific domains...")

        topic_domain_mapping = {}

        for topic_id in topic_info['Topic'].unique():
            if topic_id == -1:  # Skip outliers
                continue

            # Get topic keywords
            topic_keywords = self._get_topic_keywords(topic_model, topic_id)

            # Calculate domain scores
            domain_scores = self._calculate_domain_scores(topic_keywords)

            # Assign primary domain
            primary_domain, confidence = self._assign_primary_domain(domain_scores)

            topic_domain_mapping[topic_id] = {
                'primary_domain': primary_domain,
                'confidence': confidence,
                'topic_keywords': topic_keywords
            }

            logging.info(f"Topic {topic_id} → {primary_domain} (confidence: {confidence:.3f})")

        self.topic_domain_map = topic_domain_mapping
        return topic_domain_mapping

    def _get_topic_keywords(self, topic_model, topic_id, top_k=15):
        """Get topic keywords"""
        try:
            topic_words = topic_model.get_topic(topic_id)
            if topic_words:
                return [word for word, score in topic_words[:top_k]]
            return []
        except:
            return []

    def _calculate_domain_scores(self, topic_keywords):
        """Calculate domain scores using keyword overlap"""
        domain_scores = {}

        if not topic_keywords:
            return domain_scores

        for domain, keywords in self.domain_lexicons.items():
            domain_set = set(keywords)
            topic_set = set(topic_keywords)

            intersection = len(domain_set.intersection(topic_set))
            union = len(domain_set.union(topic_set))

            if union > 0:
                jaccard_similarity = intersection / union
            else:
                jaccard_similarity = 0

            domain_scores[domain] = jaccard_similarity

        # Normalize scores
        total_score = sum(domain_scores.values())
        if total_score > 0:
            domain_scores = {domain: score/total_score for domain, score in domain_scores.items()}

        return domain_scores

    def _assign_primary_domain(self, domain_scores, confidence_threshold=0.3):
        """Assign primary domain"""
        if not domain_scores:
            return 'interdisciplinary', 0.0

        primary_domain = max(domain_scores, key=domain_scores.get)
        confidence = domain_scores[primary_domain]

        if confidence < confidence_threshold:
            return 'interdisciplinary', confidence

        return primary_domain, confidence

    def assign_domains_to_documents(self, document_topic_map, topic_domain_mapping):
        """Assign domains to individual documents"""
        logging.info("Assigning domains to documents...")

        document_domain_assignments = []

        for doc in document_topic_map:
            filename = doc['filename']
            topic_id = doc['topic_id']

            if topic_id == -1:  # Outlier document
                domain_assignment = {
                    'filename': filename,
                    'primary_domain': 'interdisciplinary',
                    'domain_confidence': 0.0,
                    'is_outlier': True,
                    'topic_id': topic_id
                }
            else:
                topic_domain_info = topic_domain_mapping.get(topic_id, {})

                domain_assignment = {
                    'filename': filename,
                    'primary_domain': topic_domain_info.get('primary_domain', 'unknown'),
                    'domain_confidence': topic_domain_info.get('confidence', 0.0),
                    'is_outlier': False,
                    'topic_id': topic_id,
                    'topic_keywords': topic_domain_info.get('topic_keywords', [])
                }

            document_domain_assignments.append(domain_assignment)

        logging.info(f"Domain assignments completed for {len(document_domain_assignments)} documents")
        return document_domain_assignments

    def analyze_domain_distribution(self, document_domain_assignments):
        """Analyze distribution of documents across domains"""
        domain_counts = {}
        domain_confidence = {}

        for assignment in document_domain_assignments:
            domain = assignment['primary_domain']
            confidence = assignment['domain_confidence']

            if domain not in domain_counts:
                domain_counts[domain] = 0
                domain_confidence[domain] = []

            domain_counts[domain] += 1
            domain_confidence[domain].append(confidence)

        # Calculate statistics per domain
        domain_stats = {}
        for domain, counts in domain_counts.items():
            confidences = domain_confidence[domain]
            avg_confidence = np.mean(confidences) if confidences else 0

            domain_stats[domain] = {
                'document_count': counts,
                'average_confidence': avg_confidence,
                'percentage': (counts / len(document_domain_assignments)) * 100
            }

        # Sort by document count
        domain_stats = dict(sorted(
            domain_stats.items(),
            key=lambda x: x[1]['document_count'],
            reverse=True
        ))

        logging.info("Domain distribution analysis completed")
        return domain_stats

# =============================================================================
# MAIN EXECUTION
# =============================================================================

class Phase1Executor:
    def __init__(self):
        try:
            self.data_loader = EnhancedDataLoader()
            self.embedding_generator = EnhancedEmbeddingsGenerator()
            self.topic_modeler = RobustTopicModeler()
            self.domain_mapper = SimpleDomainMapper()
            logging.info("Phase1Executor initialized successfully")
        except Exception as e:
            logging.error(f"Failed to initialize Phase1Executor: {e}")
            raise

    def run_phase1(self, use_saved_embeddings=False):
        """Execute complete Phase 1 pipeline"""
        logging.info("STARTING PHASE 1: TOPIC-DOMAIN MODELING")
        logging.info("=" * 60)

        try:
            # Step 1: Load and prepare data
            logging.info("STEP 1: Loading and preparing data...")
            data_df = self.data_loader.load_data()
            documents, filenames = self.data_loader.get_documents_for_topic_modeling()

            if len(documents) == 0:
                raise ValueError("No valid documents found for processing")

            logging.info(f"Processing {len(documents)} documents...")

            # Step 2: Generate or load embeddings
            embeddings_path = f"{config.OUTPUT_DIR}/document_embeddings.npy"

            if use_saved_embeddings and os.path.exists(embeddings_path):
                logging.info("Loading saved embeddings...")
                embeddings, filenames = self.embedding_generator.load_embeddings(embeddings_path)
            else:
                logging.info("Generating new embeddings...")
                embeddings = self.embedding_generator.generate_embeddings(documents)
                self.embedding_generator.save_embeddings(embeddings, filenames, embeddings_path)

            # Step 3: Topic modeling
            logging.info("STEP 3: Performing topic modeling...")
            topics, probabilities = self.topic_modeler.fit_model(documents, embeddings, filenames)

            # Get topic information
            topic_info = self.topic_modeler.get_topic_info()
            document_topic_map = self.topic_modeler.get_document_topic_mapping()

            # Save topic model
            model_path = f"{config.OUTPUT_DIR}/topic_model"
            self.topic_modeler.save_model(model_path)

            # Step 4: Domain mapping
            logging.info("STEP 4: Mapping topics to domains...")
            topic_domain_mapping = self.domain_mapper.map_topics_to_domains(
                self.topic_modeler.topic_model, topic_info
            )

            document_domain_assignments = self.domain_mapper.assign_domains_to_documents(
                document_topic_map, topic_domain_mapping
            )

            domain_stats = self.domain_mapper.analyze_domain_distribution(
                document_domain_assignments
            )

            # Step 5: Save results
            self._save_results(document_domain_assignments, topic_domain_mapping, domain_stats)

            logging.info("PHASE 1 COMPLETED SUCCESSFULLY!")
            self._print_summary(topic_info, domain_stats)

            return {
                'document_domain_assignments': document_domain_assignments,
                'topic_domain_mapping': topic_domain_mapping,
                'domain_stats': domain_stats,
                'topic_info': topic_info,
                'evaluation_results': self.topic_modeler.evaluation_results
            }

        except Exception as e:
            logging.error(f"Phase 1 execution failed: {e}")
            import traceback
            logging.error(traceback.format_exc())
            raise

    def _save_results(self, document_assignments, topic_mapping, domain_stats):
        """Save all results to files"""
        logging.info("Saving final results...")

        try:
            # Save document assignments
            doc_assignments_df = pd.DataFrame(document_assignments)
            doc_assignments_df.to_csv(
                f"{config.OUTPUT_DIR}/document_domain_assignments.csv", index=False
            )

            # Save topic-domain mapping
            topic_mapping_df = pd.DataFrame([
                {**{'topic_id': tid}, **info}
                for tid, info in topic_mapping.items()
            ])
            topic_mapping_df.to_csv(
                f"{config.OUTPUT_DIR}/topic_domain_mapping.csv", index=False
            )

            # Save domain statistics
            domain_stats_df = pd.DataFrame([
                {**{'domain': domain}, **stats}
                for domain, stats in domain_stats.items()
            ])
            domain_stats_df.to_csv(
                f"{config.OUTPUT_DIR}/domain_statistics.csv", index=False
            )

            logging.info("Final results saved to CSV files")

        except Exception as e:
            logging.error(f"Error saving final results: {e}")
            raise

    def _print_summary(self, topic_info, domain_stats):
        """Print execution summary"""
        print("\n" + "="*60)
        print("PHASE 1 EXECUTION SUMMARY")
        print("="*60)

        # Basic evaluation
        eval_results = self.topic_modeler.evaluation_results
        basic = eval_results.get('basic_stats', {})
        cluster = eval_results.get('cluster_metrics', {})
        coherence = eval_results.get('coherence_metrics', {})

        print(f"Documents processed: {len(self.topic_modeler.topics)}")
        print(f"Topics discovered: {basic.get('n_topics', 0)}")
        print(f"Outliers: {basic.get('n_outliers', 0)} ({basic.get('outlier_percentage', 0):.1f}%)")
        print(f"Silhouette Score: {cluster.get('silhouette_score', 0):.3f}")
        print(f"Coherence (c_v): {coherence.get('c_v', 0):.3f}")

        print(f"\nDomain Distribution:")
        for domain, stats in domain_stats.items():
            print(f"  {domain}: {stats['document_count']} docs ({stats['percentage']:.1f}%)")

        print(f"\nAll outputs saved to: {config.OUTPUT_DIR}/")
        print("="*60)

# =============================================================================
# SETUP LOGGING AND RUN
# =============================================================================

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f"{config.OUTPUT_DIR}/phase1_execution.log"),
        logging.StreamHandler()
    ]
)

def run_phase1_colab(use_saved_embeddings=False):
    """Convenience function to run Phase 1 in Colab"""
    print("STARTING PHASE 1")
    print("=" * 60)

    executor = Phase1Executor()
    results = executor.run_phase1(use_saved_embeddings=use_saved_embeddings)

    print("\nPHASE 1 COMPLETED SUCCESSFULLY!")
    print(f"Output directory: {config.OUTPUT_DIR}")

    return results

# =============================================================================
# EXECUTE PHASE 1
# =============================================================================

if __name__ == "__main__":
    # Run Phase 1
    results = run_phase1_colab(use_saved_embeddings=False)

All imports completed successfully!
NumPy version: 2.0.2
Pandas version: 2.2.2
PyTorch version: 2.8.0+cu126
STARTING PHASE 1


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/58 [00:00<?, ?it/s]

2025-11-07 15:05:22,726 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-07 15:05:25,984 - BERTopic - Dimensionality - Completed ✓
2025-11-07 15:05:25,986 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-07 15:05:26,044 - BERTopic - Cluster - Completed ✓
2025-11-07 15:05:26,048 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-07 15:05:39,163 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-07 15:05:43,234 - BERTopic - Dimensionality - Completed ✓
2025-11-07 15:05:43,235 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-07 15:05:43,288 - BERTopic - Cluster - Completed ✓
2025-11-07 15:05:43,292 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-07 15:05:46,788 - BERTopic - Representation - Completed ✓



PHASE 1 EXECUTION SUMMARY
Documents processed: 924
Topics discovered: 4
Outliers: 0 (0.0%)
Silhouette Score: 0.196
Coherence (c_v): 0.714

Domain Distribution:
  biology: 755 docs (81.7%)
  chemistry: 155 docs (16.8%)
  interdisciplinary: 14 docs (1.5%)

All outputs saved to: phase1_output_optimized_v3/

PHASE 1 COMPLETED SUCCESSFULLY!
Output directory: phase1_output_optimized_v3


In [2]:
# For CTM support
!pip install contextualized-topic-models

Collecting contextualized-topic-models
  Downloading contextualized_topic_models-2.6.1-py2.py3-none-any.whl.metadata (24 kB)
Collecting ipywidgets>=8.0.0 (from contextualized-topic-models)
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting comm>=0.1.3 (from ipywidgets>=8.0.0->contextualized-topic-models)
  Downloading comm-0.2.3-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets>=8.0.0->contextualized-topic-models)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets>=8.0.0->contextualized-topic-models)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading contextualized_topic_models-2.6.1-py2.py3-none-any.whl (36 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading comm-0.2.3-py3-no

In [4]:
# =============================================================================
# IMPORTS - RUN AFTER RUNTIME RESTART
# =============================================================================

import logging
import pandas as pd
import numpy as np
import os
from datetime import datetime, timezone
import sys
import torch
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import pickle
from tqdm import tqdm
import re
import json
from collections import Counter
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

# Download NLTK stopwords
try:
    import nltk
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    NLTK_AVAILABLE = True
    print(" NLTK stopwords downloaded successfully")
except Exception as e:
    print(f" NLTK stopwords download failed: {e}")
    NLTK_AVAILABLE = False

# CTM imports
try:
    from contextualized_topic_models.models.ctm import CombinedTM
    from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
    from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
    from contextualized_topic_models.evaluation.measures import CoherenceNPMI, CoherenceUMASS
    CTM_AVAILABLE = True
    print("✅ Contextualized Topic Models (CTM) imported successfully")
except ImportError as e:
    print(f"⚠️  CTM not available: {e}")
    print("🔧 To install CTM: pip install contextualized-topic-models")
    CTM_AVAILABLE = False

print("All imports completed successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"PyTorch version: {torch.__version__}")

# =============================================================================
# ENHANCED CONFIGURATION WITH CTM SUPPORT
# =============================================================================

class EnhancedTopicModelingConfig:
    # Paths
    PROCESSED_TEXT_CSV = "processed_combined_texts.csv"
    TRAIN_LABELS_CSV = "train_labels.csv"
    OUTPUT_DIR = "enhanced_topic_modeling_output"

    # Enhanced BERTopic-optimized embedding models for scientific text
    EMBEDDING_MODELS = [
        {
            'name': "allenai/scibert_scivocab_uncased",  # Domain-specific (BEST for scientific text)
            'dim': 768,
            'performance': 'domain_high',
            'speed': 'medium',
            'recommended': True
        },
        {
            'name': "sentence-transformers/all-mpnet-base-v2",  # Best general purpose
            'dim': 768,
            'performance': 'high',
            'speed': 'medium',
            'recommended': False
        },
        {
            'name': "sentence-transformers/all-MiniLM-L12-v2",  # Faster alternative
            'dim': 384,
            'performance': 'good',
            'speed': 'fast',
            'recommended': False
        }
    ]
    CURRENT_EMBEDDING_MODEL = EMBEDDING_MODELS[0]['name']  # SciBERT for scientific text

    # CTM Configuration
    CTM_CONFIG = {
        'model_type': 'CombinedTM',  # Combines BoW + contextual embeddings
        'num_topics': 20,  # Default number of topics
        'hidden_sizes': (100,),  # Hidden layer sizes
        'activation': 'softplus',  # Activation function
        'dropout': 0.2,  # Dropout rate
        'learn_priors': True,  # Learn topic priors
        'batch_size': 32,  # Reduced for stability
        'lr': 2e-3,  # Learning rate
        'momentum': 0.99,  # Momentum
        'solver': 'adam',  # Optimizer
        'num_epochs': 50,  # Reduced for faster testing
        'reduce_on_plateau': False,  # Reduce learning rate on plateau
        'topic_prior_mean': 0.0,  # Topic prior mean
        'topic_prior_variance': None,  # Topic prior variance
        'num_samples': 5,  # Reduced for stability
        'num_data_loader_workers': 0,  # Data loader workers
    }

    # ENHANCED UMAP parameters for scientific text
    UMAP_CONFIGS = [
        {
            'name': 'scientific_detailed',
            'n_neighbors': 12,      # More local structure for detailed topics
            'n_components': 8,      # More dimensions for complex scientific relationships
            'min_dist': 0.05,       # Slightly more spread for better separation
            'metric': 'cosine',
            'random_state': 42
        },
        {
            'name': 'scientific_standard',
            'n_neighbors': 15,
            'n_components': 5,
            'min_dist': 0.0,
            'metric': 'cosine',
            'random_state': 42
        }
    ]
    CURRENT_UMAP_CONFIG = UMAP_CONFIGS[0]

    # ENHANCED HDBSCAN parameters for scientific topics
    HDBSCAN_CONFIGS = [
        {
            'name': 'scientific_sensitive',
            'min_cluster_size': 8,      # Smaller clusters for detailed scientific topics
            'min_samples': 2,           # More sensitive to local structure
            'cluster_selection_epsilon': 0.03,
            'metric': 'euclidean',
            'cluster_selection_method': 'eom'
        },
        {
            'name': 'scientific_standard',
            'min_cluster_size': 10,
            'min_samples': 1,
            'cluster_selection_epsilon': 0.05,
            'metric': 'euclidean',
            'cluster_selection_method': 'eom'
        }
    ]
    CURRENT_HDBSCAN_CONFIG = HDBSCAN_CONFIGS[0]

    # Enhanced BERTopic-specific settings
    BERTOPIC_SETTINGS = {
        'top_n_words': 12,              # More words for better domain identification
        'n_gram_range': (1, 3),         # Include trigrams for scientific terms
        'min_topic_size': 5,            # Minimum docs per topic
        'calculate_probabilities': True, # Essential for visualization
        'verbose': False                # Cleaner output
    }

    # Performance optimization settings
    MAX_DOC_LENGTH = 512                # Optimal for transformer models
    BATCH_SIZE = 16                     # Smaller batches for SciBERT stability
    SAMPLE_SIZE = 1000                  # For large-scale evaluations

    # ENHANCED Domain Lexicons with comprehensive scientific terminology (800+ keywords)
    DOMAIN_LEXICONS = {
        'computer_science': [
            'algorithm', 'software', 'programming', 'machine learning', 'neural network',
            'data structure', 'optimization', 'computational', 'artificial intelligence', 'ai',
            'deep learning', 'computer vision', 'natural language processing', 'nlp',
            'reinforcement learning', 'convolutional neural', 'transformer', 'embedding',
            'big data', 'cloud computing', 'cybersecurity', 'blockchain', 'internet of things',
            'data mining', 'pattern recognition', 'computer architecture', 'parallel computing',
            'distributed systems', 'database', 'information retrieval', 'knowledge representation',
            'robotics', 'automation', 'computer graphics', 'virtual reality', 'augmented reality',
            'quantum computing', 'bioinformatics', 'computational biology', 'neural networks',
            'support vector machine', 'random forest', 'clustering', 'classification', 'regression',
            'feature extraction', 'dimensionality reduction', 'natural language', 'speech recognition',
            'image processing', 'computer network', 'wireless sensor', 'mobile computing'
        ],
        'biology': [
            'cell', 'gene', 'organism', 'evolution', 'molecular', 'genetic', 'protein', 'dna',
            'rna', 'genome', 'genomic', 'cellular', 'biological', 'species', 'ecosystem',
            'phylogenetic', 'transcription', 'metabolism', 'enzyme', 'mutation', 'chromosome',
            'mitochondria', 'apoptosis', 'homeostasis', 'biodiversity', 'conservation', 'ecology',
            'microbiology', 'bacterium', 'virus', 'fungus', 'plant biology', 'animal behavior',
            'physiology', 'anatomy', 'immunology', 'neuroscience', 'developmental biology',
            'stem cell', 'cancer biology', 'synthetic biology', 'systems biology', 'bioinformatics',
            'proteomics', 'transcriptomics', 'metabolomics', 'epigenetics', 'gene expression',
            'cell division', 'membrane transport', 'signal transduction', 'photosynthesis',
            'respiration', 'fermentation', 'biotechnology', 'phylogeny', 'taxonomy', 'zoology',
            'botany', 'marine biology', 'environmental biology', 'molecular biology', 'cell biology'
        ],
        'medicine': [
            'patient', 'clinical', 'treatment', 'disease', 'medical', 'therapy', 'diagnosis',
            'hospital', 'health', 'pharmaceutical', 'symptom', 'drug', 'vaccine', 'clinical trial',
            'prognosis', 'epidemiology', 'pathology', 'oncology', 'immunology', 'cardiology',
            'neurology', 'psychiatry', 'surgery', 'radiology', 'pediatrics', 'geriatrics',
            'pharmacology', 'toxicology', 'virology', 'bacteriology', 'parasitology',
            'public health', 'preventive medicine', 'rehabilitation', 'palliative care',
            'medical imaging', 'biomarker', 'personalized medicine', 'telemedicine',
            'healthcare', 'medical device', 'clinical practice', 'medical research',
            'evidence-based medicine', 'medical education', 'health policy', 'global health',
            'infectious disease', 'chronic disease', 'autoimmune disease', 'genetic disorder',
            'mental health', 'nutrition', 'exercise physiology', 'sleep medicine', 'pain management'
        ],
        'physics': [
            'quantum', 'particle', 'energy', 'field', 'mechanics', 'astrophysics', 'relativity',
            'thermodynamics', 'electromagnetic', 'nuclear', 'cosmology', 'optics', 'wave',
            'entanglement', 'superconductivity', 'plasma', 'condensed matter', 'string theory',
            'quantum mechanics', 'statistical mechanics', 'fluid dynamics', 'solid state physics',
            'particle physics', 'atomic physics', 'molecular physics', 'optical physics',
            'acoustics', 'gravitation', 'black hole', 'dark matter', 'dark energy',
            'quantum field theory', 'gauge theory', 'supersymmetry', 'quantum computing',
            'nanophysics', 'biophysics', 'geophysics', 'atmospheric physics', 'space physics',
            'plasma physics', 'laser physics', 'semiconductor physics', 'magnetism',
            'superconductivity', 'crystallography', 'material science', 'quantum optics',
            'theoretical physics', 'experimental physics', 'computational physics'
        ],
        'chemistry': [
            'molecule', 'reaction', 'compound', 'synthesis', 'chemical', 'catalyst', 'polymer',
            'organic', 'inorganic', 'analytical', 'spectroscopy', 'chromatography', 'nmr',
            'stoichiometry', 'kinetics', 'equilibrium', 'crystallography', 'thermodynamics',
            'quantum chemistry', 'computational chemistry', 'medicinal chemistry', 'biochemistry',
            'environmental chemistry', 'materials chemistry', 'nanochemistry', 'electrochemistry',
            'photochemistry', 'surface chemistry', 'coordination chemistry', 'organometallic',
            'polymer chemistry', 'supramolecular chemistry', 'green chemistry', 'food chemistry',
            'forensic chemistry', 'atmospheric chemistry', 'nuclear chemistry', 'radiochemistry',
            'chemical engineering', 'chemical biology', 'cheminformatics', 'catalysis',
            'reaction mechanism', 'chemical bond', 'molecular structure', 'chemical equilibrium',
            'acid base', 'redox', 'solvent', 'solution', 'phase transition', 'chemical kinetics'
        ],
        'engineering': [
            'design', 'system', 'manufacturing', 'structural', 'electrical', 'mechanical', 'robotics',
            'automation', 'control', 'sensor', 'actuator', 'circuit', 'signal processing',
            'finite element', 'computational fluid', 'materials science', 'civil engineering',
            'aerospace engineering', 'biomedical engineering', 'chemical engineering',
            'computer engineering', 'environmental engineering', 'industrial engineering',
            'materials engineering', 'nuclear engineering', 'petroleum engineering',
            'systems engineering', 'thermal engineering', 'acoustic engineering',
            'optical engineering', 'reliability engineering', 'safety engineering',
            'quality engineering', 'project management', 'engineering design',
            'engineering optimization', 'engineering simulation', 'engineering analysis',
            'mechatronics', 'nanotechnology', 'microelectronics', 'power systems',
            'renewable energy', 'sustainable engineering', 'structural analysis',
            'fluid mechanics', 'heat transfer', 'vibration analysis', 'control theory',
            'digital signal processing', 'embedded systems', 'internet of things'
        ],
        'mathematics': [
            'equation', 'theorem', 'proof', 'algebra', 'calculus', 'statistics', 'probability',
            'optimization', 'linear', 'differential', 'integral', 'matrix', 'vector',
            'stochastic', 'markov chain', 'bayesian', 'regression', 'number theory',
            'geometry', 'topology', 'analysis', 'combinatorics', 'graph theory',
            'set theory', 'logic', 'category theory', 'group theory', 'ring theory',
            'field theory', 'representation theory', 'harmonic analysis', 'functional analysis',
            'complex analysis', 'real analysis', 'numerical analysis', 'computational mathematics',
            'applied mathematics', 'pure mathematics', 'discrete mathematics',
            'mathematical modeling', 'mathematical physics', 'financial mathematics',
            'biomathematics', 'cryptography', 'information theory', 'game theory',
            'control theory', 'signal processing', 'image processing', 'data analysis',
            'machine learning mathematics', 'statistical inference', 'sampling theory'
        ],
        'economics': [
            'market', 'economic', 'financial', 'investment', 'trade', 'monetary', 'inflation',
            'gdp', 'supply', 'demand', 'price', 'consumer', 'producer', 'competition',
            'econometrics', 'game theory', 'equilibrium', 'welfare', 'macroeconomics',
            'microeconomics', 'development economics', 'labor economics', 'health economics',
            'environmental economics', 'international economics', 'public economics',
            'behavioral economics', 'experimental economics', 'financial economics',
            'monetary economics', 'industrial organization', 'urban economics',
            'agricultural economics', 'resource economics', 'energy economics',
            'transportation economics', 'economic growth', 'economic development',
            'economic policy', 'fiscal policy', 'monetary policy', 'trade policy',
            'economic inequality', 'poverty', 'unemployment', 'inflation targeting',
            'exchange rate', 'interest rate', 'stock market', 'bond market',
            'risk management', 'portfolio optimization', 'corporate finance'
        ]
    }

    # Enhanced evaluation thresholds
    EVALUATION_THRESHOLDS = {
        'silhouette': {'excellent': 0.5, 'good': 0.3, 'fair': 0.1, 'poor': 0.0},
        'coherence': {'excellent': 0.6, 'good': 0.4, 'fair': 0.2, 'poor': 0.0},
        'outlier_ratio': {'excellent': 0.1, 'good': 0.2, 'fair': 0.3, 'poor': 1.0},
        'topic_balance': {'excellent': 0.2, 'good': 0.4, 'fair': 0.6, 'poor': 1.0}
    }

    def __init__(self):
        # Create output directory
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

    def get_technique_summary(self):
        """Generate enhanced technique summary"""
        return {
            'pipeline': 'Enhanced Topic Modeling (BERTopic + CTM)',
            'embedding_model': self.CURRENT_EMBEDDING_MODEL,
            'umap_config': self.CURRENT_UMAP_CONFIG['name'],
            'hdbscan_config': self.CURRENT_HDBSCAN_CONFIG['name'],
            'ctm_config': self.CTM_CONFIG,
            'vectorizer': 'c-TF-IDF with trigrams',
            'domains_covered': list(self.DOMAIN_LEXICONS.keys()),
            'total_keywords': sum(len(keywords) for keywords in self.DOMAIN_LEXICONS.values()),
            'corpus_size': 'scientific_optimized'
        }

# Initialize enhanced config
config = EnhancedTopicModelingConfig()
print(f" Enhanced Topic Modeling configuration initialized")
print(f" Output directory: {config.OUTPUT_DIR}")
print(f" Total domain keywords: {sum(len(keywords) for keywords in config.DOMAIN_LEXICONS.values())}")
print(f" CTM Available: {CTM_AVAILABLE}")

# =============================================================================
# SCIENTIFIC TEXT PREPROCESSOR WITH STOPWORDS AND LINKS REMOVAL
# =============================================================================

class ScientificTextPreprocessor:
    def __init__(self):
        # Scientific paper specific stop words (preserve domain terms)
        self.scientific_stop_words = set([
            'paper', 'study', 'research', 'result', 'method', 'approach', 'propose',
            'show', 'demonstrate', 'present', 'investigate', 'analyze', 'discuss',
            'conclude', 'suggest', 'indicate', 'figure', 'table', 'section', 'abstract',
            'introduction', 'background', 'methodology', 'experiment', 'evaluation'
        ])

        # Load NLTK stopwords if available
        if NLTK_AVAILABLE:
            self.nltk_stop_words = set(stopwords.words('english'))
        else:
            self.nltk_stop_words = set([
                'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
                "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
                'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
                'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
                'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
                'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
                'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
                'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
                'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
                'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
                "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
                'wouldn', "wouldn't"
            ])

        # Combine all stopwords
        self.all_stop_words = self.scientific_stop_words.union(self.nltk_stop_words)

        # Compile regex patterns for performance
        self.patterns = [
            re.compile(r'\b(doi|https?://|www\.)\S+', re.IGNORECASE),  # URLs and DOIs
            re.compile(r'http\S+|www\S+|https\S+', re.IGNORECASE),     # Various URL formats
            re.compile(r'\S*@\S*\s?', re.IGNORECASE),                  # Email addresses
            re.compile(r'\[.*?\]', re.IGNORECASE),                     # Square brackets content
            re.compile(r'\(.*?\)', re.IGNORECASE),                     # Parentheses content
            re.compile(r'[^\w\s]', re.IGNORECASE),                     # Punctuation (keep underscores)
            re.compile(r'\b(fig|figure|table)\s+\d+', re.IGNORECASE),  # Figure/table references
            re.compile(r'\b(et al|etc|e\.g|i\.e)\.', re.IGNORECASE),   # Common Latin abbreviations
            re.compile(r'\b\d{4}\b'),                                  # Years only
            re.compile(r'\b\d+\b'),                                    # Standalone numbers
            re.compile(r'\b[a-zA-Z]\b'),                               # Single letters
            re.compile(r'\s+'),                                        # Multiple whitespace
        ]

    def clean_scientific_text(self, text):
        """BERTopic-optimized cleaning for scientific text with comprehensive stopword and link removal"""
        if not isinstance(text, str) or len(text) < 50:
            return ""

        # Truncate very long texts for BERTopic performance
        if len(text) > config.MAX_DOC_LENGTH:
            text = text[:config.MAX_DOC_LENGTH]

        text = text.lower().strip()

        # Apply all regex patterns for link and special character removal
        for pattern in self.patterns:
            text = pattern.sub(' ', text)

        # Remove stop words efficiently (preserve scientific terms)
        words = text.split()
        cleaned_words = [
            word for word in words
            if word not in self.all_stop_words
            and len(word) > 2
            and len(word) < 25  # Remove very long words but keep scientific terms
            and not word.isdigit()  # Remove pure numbers
        ]

        return ' '.join(cleaned_words).strip()

    def remove_links_and_urls(self, text):
        """Specialized function to remove links and URLs"""
        if not isinstance(text, str):
            return ""

        # Comprehensive URL patterns
        url_patterns = [
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'\S*\.com\S*',
            r'\S*\.org\S*',
            r'\S*\.edu\S*',
            r'\S*\.gov\S*',
            r'doi:\s*\S+',
            r'arXiv:\s*\S+',
        ]

        for pattern in url_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

        return text

# =============================================================================
# DATA LOADER
# =============================================================================

class TopicModelingDataLoader:
    def __init__(self):
        self.processed_texts_df = None
        self.train_labels_df = None
        self.preprocessor = ScientificTextPreprocessor()

    def load_data(self):
        """Data loading optimized for topic modeling"""
        import time
        start_time = time.time()

        try:
            print(" Loading data for enhanced topic modeling...")

            # Load processed texts
            self.processed_texts_df = pd.read_csv(config.PROCESSED_TEXT_CSV)
            print(f" Loaded {len(self.processed_texts_df)} documents from {config.PROCESSED_TEXT_CSV}")

            # Load training labels if available
            if os.path.exists(config.TRAIN_LABELS_CSV):
                self.train_labels_df = pd.read_csv(config.TRAIN_LABELS_CSV)
                print(" Training labels loaded")
            else:
                print("  Training labels not found, continuing without them")

            # Apply preprocessing
            self._preprocessing()

            load_time = time.time() - start_time
            print(f" Data loaded in {load_time:.2f}s: {len(self.processed_texts_df)} documents")

            return self.processed_texts_df

        except Exception as e:
            print(f" Error loading data: {e}")
            raise

    def _preprocessing(self):
        """Topic modeling-specific preprocessing with enhanced cleaning"""
        print(" Applying enhanced text cleaning with stopword and link removal...")

        # First remove links and URLs
        self.processed_texts_df['text_no_links'] = self.processed_texts_df['processed_text'].apply(
            self.preprocessor.remove_links_and_urls
        )

        # Then clean texts while preserving scientific terminology
        self.processed_texts_df['cleaned_text'] = self.processed_texts_df['text_no_links'].apply(
            self.preprocessor.clean_scientific_text
        )

        # Filter out very short documents
        initial_count = len(self.processed_texts_df)
        self.processed_texts_df = self.processed_texts_df[
            self.processed_texts_df['cleaned_text'].str.len() > 50
        ].copy()

        filtered_count = initial_count - len(self.processed_texts_df)
        if filtered_count > 0:
            print(f" Filtered {filtered_count} documents with very short text")

        # Text statistics
        text_stats = self.processed_texts_df['cleaned_text'].str.split().str.len().describe()
        print(f" Text statistics: mean={text_stats['mean']:.1f} words, "
              f"min={text_stats['min']}, max={text_stats['max']}")

        # Show cleaning effectiveness
        original_lengths = self.processed_texts_df['processed_text'].str.len()
        cleaned_lengths = self.processed_texts_df['cleaned_text'].str.len()
        reduction_ratio = (1 - cleaned_lengths.mean() / original_lengths.mean()) * 100
        print(f" Text reduction: {reduction_ratio:.1f}% size reduction after cleaning")

    def get_documents_for_topic_modeling(self):
        """Get documents optimized for topic modeling"""
        if self.processed_texts_df is None:
            self.load_data()

        documents = self.processed_texts_df['cleaned_text'].tolist()
        filenames = self.processed_texts_df['filename'].tolist()

        # Remove any empty documents
        valid_docs = []
        valid_filenames = []

        for doc, filename in zip(documents, filenames):
            if doc and isinstance(doc, str) and len(doc.strip()) > 30:
                valid_docs.append(doc.strip())
                valid_filenames.append(filename)

        print(f" Prepared {len(valid_docs)} documents for enhanced topic modeling")
        return valid_docs, valid_filenames

# =============================================================================
# ENHANCED EMBEDDINGS GENERATOR WITH SciBERT
# =============================================================================

class EnhancedEmbeddingsGenerator:
    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.technique_used = {}

        print(f" Using device: {self.device}")
        if torch.cuda.is_available():
            print(f" GPU: {torch.cuda.get_device_name()}")
            print(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    def load_model(self, model_name=config.CURRENT_EMBEDDING_MODEL):
        """Load SciBERT embedding model optimized for scientific text"""
        try:
            print(f" Loading embedding model: {model_name}")
            start_time = datetime.now(timezone.utc)

            if "scibert" in model_name.lower():
                # SciBERT for domain-specific embeddings
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(model_name)
                self.model.to(self.device)
                self.model.eval()
                model_type = "SciBERT"
            elif "sentence-transformers" in model_name:
                # Use sentence-transformers
                self.model = SentenceTransformer(model_name, device=self.device)
                model_type = "SentenceTransformer"
            else:
                # Fallback to transformers
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(model_name)
                self.model.to(self.device)
                self.model.eval()
                model_type = "Transformer"

            load_time = (datetime.now(timezone.utc) - start_time).total_seconds()

            self.technique_used = {
                'model_name': model_name,
                'model_type': model_type,
                'device': str(self.device),
                'embedding_dim': self._get_embedding_dim(),
                'load_time_seconds': load_time,
                'scientific_optimized': True,
                'domain_specific': "scibert" in model_name.lower()
            }

            print(f" {model_type} model loaded in {load_time:.2f}s")
            print(f" Embedding dimension: {self.technique_used['embedding_dim']}")

        except Exception as e:
            print(f" Failed to load {model_name}: {e}")
            # Fallback to all-mpnet-base-v2
            fallback_model = "sentence-transformers/all-mpnet-base-v2"
            if model_name != fallback_model:
                print(f" Falling back to {fallback_model}")
                return self.load_model(fallback_model)
            else:
                raise

    def _get_embedding_dim(self):
        """Get embedding dimension dynamically"""
        if isinstance(self.model, SentenceTransformer):
            return self.model.get_sentence_embedding_dimension()
        else:
            return self.model.config.hidden_size

    def generate_embeddings(self, documents, normalize_embeddings=True):
        """Generate embeddings for topic modeling"""
        if self.model is None:
            self.load_model()

        print(f" Generating embeddings for {len(documents)} documents...")
        start_time = datetime.now(timezone.utc)

        batch_size = config.BATCH_SIZE
        print(f" Using batch size: {batch_size}")

        all_embeddings = []

        if isinstance(self.model, SentenceTransformer):
            # Sentence-transformers
            all_embeddings = self.model.encode(
                documents,
                batch_size=batch_size,
                show_progress_bar=True,
                convert_to_tensor=False,
                normalize_embeddings=normalize_embeddings,
                device=self.device
            )
        else:
            # Transformer models with mean pooling
            with torch.no_grad():
                for i in tqdm(range(0, len(documents), batch_size), desc="Generating embeddings"):
                    batch_docs = documents[i:i + batch_size]

                    inputs = self.tokenizer(
                        batch_docs,
                        padding=True,
                        truncation=True,
                        max_length=config.MAX_DOC_LENGTH,
                        return_tensors="pt"
                    ).to(self.device)

                    outputs = self.model(**inputs)
                    # Use mean pooling for document representations
                    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

                    if normalize_embeddings:
                        embeddings = normalize(embeddings, norm='l2')

                    all_embeddings.append(embeddings)

            all_embeddings = np.vstack(all_embeddings)

        generation_time = (datetime.now(timezone.utc) - start_time).total_seconds()
        docs_per_second = len(documents) / generation_time

        self.technique_used.update({
            'generation_time_seconds': generation_time,
            'documents_per_second': docs_per_second,
            'embedding_shape': all_embeddings.shape,
            'normalized': normalize_embeddings,
            'batch_size': batch_size,
            'embedding_method': 'mean_pooling'
        })

        print(f" Embeddings generated in {generation_time:.2f}s ({docs_per_second:.1f} docs/s)")
        print(f" Embeddings shape: {all_embeddings.shape}")

        return all_embeddings

    def get_technique_summary(self):
        """Get embedding summary"""
        return self.technique_used

# =============================================================================
# SIMPLIFIED CTM IMPLEMENTATION (FIXED)
# =============================================================================

class SimplifiedCTM:
    def __init__(self):
        self.model = None
        self.vocab = None
        self.technique_used = {}

    def prepare_data(self, documents, embeddings):
        """Simplified data preparation for CTM"""
        if not CTM_AVAILABLE:
            raise ImportError("CTM library not available")

        print(" Preparing data for Simplified CTM...")

        try:
            # Simple preprocessing without complex dependencies
            preprocessed_docs = []
            for doc in documents:
                if isinstance(doc, str):
                    # Basic cleaning
                    doc_clean = re.sub(r'[^\w\s]', ' ', doc.lower())
                    words = [word for word in doc_clean.split()
                            if len(word) > 2 and len(word) < 25]
                    preprocessed_docs.append(' '.join(words))
                else:
                    preprocessed_docs.append('')

            # Create vocabulary
            all_words = ' '.join(preprocessed_docs).split()
            word_counts = Counter(all_words)
            self.vocab = [word for word, count in word_counts.most_common(5000) if count > 1]

            print(f" Preprocessed {len(preprocessed_docs)} documents")
            print(f" Vocabulary size: {len(self.vocab)}")

            # Use TopicModelDataPreparation directly
            self.tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v2")

            # Prepare training data
            self.training_dataset = self.tp.fit(
                text_for_contextual=documents,
                text_for_bow=preprocessed_docs
            )

            print(" Data preparation completed for CTM")

        except Exception as e:
            print(f" CTM data preparation failed: {e}")
            raise

    def train_model(self, num_topics=20, num_epochs=50):
        """Train the CTM model"""
        if not CTM_AVAILABLE:
            raise ImportError("CTM library not available")

        if not hasattr(self, 'training_dataset'):
            raise ValueError("Data not prepared. Call prepare_data() first.")

        print(f" Training Simplified CTM with {num_topics} topics...")
        start_time = datetime.now(timezone.utc)

        try:
            # Get dataset attributes safely
            bow_size = len(self.tp.vocab)
            contextual_size = 768  # Default for paraphrase model

            # Initialize CTM model
            self.model = CombinedTM(
                bow_size=bow_size,
                contextual_size=contextual_size,
                n_components=num_topics,
                model_type='LDA',
                hidden_sizes=(100,),
                activation='softplus',
                dropout=0.2,
                learn_priors=True,
                batch_size=32,
                lr=2e-3,
                momentum=0.99,
                solver='adam',
                num_epochs=num_epochs,
                reduce_on_plateau=False,
                num_data_loader_workers=0
            )

            # Train the model
            self.model.fit(self.training_dataset)

            training_time = (datetime.now(timezone.utc) - start_time).total_seconds()

            self.technique_used = {
                'model_type': 'CombinedTM',
                'num_topics': num_topics,
                'num_epochs': num_epochs,
                'training_time_seconds': training_time,
                'vocab_size': bow_size,
                'contextual_size': contextual_size,
                'approach': 'probabilistic_with_contextual_embeddings'
            }

            print(f" CTM training completed in {training_time:.2f}s")

        except Exception as e:
            print(f" CTM training failed: {e}")
            raise

    def get_topic_distributions(self):
        """Get topic distributions for documents"""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")

        try:
            topic_distributions = self.model.get_thetas(self.training_dataset)
            return topic_distributions
        except Exception as e:
            print(f"  Error getting topic distributions: {e}")
            # Return uniform distributions as fallback
            return np.ones((len(self.training_dataset), self.model.n_components)) / self.model.n_components

    def get_topics(self, n_words=10):
        """Get topic-word distributions"""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")

        topics = {}
        try:
            # Get topic-word matrix
            topic_word_matrix = self.model.get_topic_word_matrix()

            for topic_idx in range(topic_word_matrix.shape[0]):
                # Get top words for this topic
                top_word_indices = np.argsort(topic_word_matrix[topic_idx])[-n_words:][::-1]
                top_words = [self.tp.vocab[i] for i in top_word_indices if i < len(self.tp.vocab)]
                topics[topic_idx] = top_words

        except Exception as e:
            print(f"  Error extracting topics: {e}")
            # Fallback: use generic topic names
            for topic_idx in range(self.model.n_components):
                topics[topic_idx] = [f"topic_{topic_idx}_word_{i}" for i in range(n_words)]

        return topics

    def get_technique_summary(self):
        """Get CTM technique summary"""
        return self.technique_used

# =============================================================================
# ENHANCED BERTopic MODELER
# =============================================================================

class EnhancedBERTopicModeler:
    def __init__(self):
        self.topic_model = None
        self.embeddings = None
        self.documents = None
        self.topics = None
        self.probabilities = None
        self.technique_used = {}

    def initialize_topic_model(self):
        """Initialize enhanced BERTopic with SciBERT optimization"""
        print(" Initializing enhanced BERTopic...")

        # Enhanced UMAP for scientific text
        umap_config = config.CURRENT_UMAP_CONFIG
        umap_model = UMAP(
            n_neighbors=umap_config['n_neighbors'],
            n_components=umap_config['n_components'],
            min_dist=umap_config['min_dist'],
            metric=umap_config['metric'],
            random_state=umap_config['random_state'],
            low_memory=False,
            n_epochs=1000
        )

        # Enhanced HDBSCAN for scientific topics
        hdbscan_config = config.CURRENT_HDBSCAN_CONFIG
        hdbscan_model = HDBSCAN(
            min_cluster_size=hdbscan_config['min_cluster_size'],
            min_samples=hdbscan_config['min_samples'],
            metric=hdbscan_config['metric'],
            cluster_selection_method=hdbscan_config['cluster_selection_method'],
            prediction_data=True,
            cluster_selection_epsilon=hdbscan_config.get('cluster_selection_epsilon', 0.0)
        )

        # Enhanced vectorizer for scientific text with trigrams
        vectorizer_model = CountVectorizer(
            stop_words="english",
            ngram_range=config.BERTOPIC_SETTINGS['n_gram_range'],
            min_df=2,
            max_df=0.90,
            max_features=15_000
        )

        # Initialize enhanced BERTopic
        self.topic_model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            top_n_words=config.BERTOPIC_SETTINGS['top_n_words'],
            min_topic_size=config.BERTOPIC_SETTINGS['min_topic_size'],
            n_gram_range=config.BERTOPIC_SETTINGS['n_gram_range'],
            calculate_probabilities=config.BERTOPIC_SETTINGS['calculate_probabilities'],
            verbose=config.BERTOPIC_SETTINGS['verbose'],
            language='english'
        )

        self.technique_used = {
            'pipeline': 'Enhanced BERTopic',
            'umap_config': umap_config['name'],
            'hdbscan_config': hdbscan_config['name'],
            'vectorizer': 'c-TF-IDF with trigrams',
            'min_topic_size': config.BERTOPIC_SETTINGS['min_topic_size'],
            'optimized_for': 'scientific_text'
        }

        print(" Enhanced BERTopic initialized")

    def fit_model(self, documents, embeddings, filenames=None):
        """Fit enhanced BERTopic model"""
        print(" Fitting enhanced BERTopic model...")
        start_time = datetime.now(timezone.utc)

        self.embeddings = embeddings
        self.documents = documents

        # Initialize enhanced BERTopic
        self.initialize_topic_model()

        # Fit BERTopic
        try:
            self.topics, self.probabilities = self.topic_model.fit_transform(
                documents, embeddings
            )
        except Exception as e:
            print(f"  BERTopic fitting failed: {e}")
            self._fallback(documents, embeddings)

        fit_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        # Statistics
        unique_topics = len(set(self.topics)) - (1 if -1 in self.topics else 0)
        outliers = np.sum(self.topics == -1)

        print(f" Enhanced BERTopic modeling completed in {fit_time:.2f}s")
        print(f" Results: {unique_topics} topics, {outliers} outliers ({outliers/len(self.topics)*100:.1f}%)")

        # Store technique info
        self.technique_used['fit_time_seconds'] = fit_time
        self.technique_used['n_topics_found'] = unique_topics
        self.technique_used['outlier_percentage'] = outliers/len(self.topics)*100

        return self.topics, self.probabilities

    def _fallback(self, documents, embeddings):
        """Fallback strategy"""
        print(" Trying BERTopic fallback strategy...")

        try:
            self.topic_model = BERTopic(
                min_topic_size=config.BERTOPIC_SETTINGS['min_topic_size'],
                calculate_probabilities=True,
                verbose=False
            )
            self.topics, self.probabilities = self.topic_model.fit_transform(documents, embeddings)
            print(" BERTopic fallback strategy succeeded")
            return
        except Exception as e:
            print(f" BERTopic fallback failed: {e}")
            raise

    def get_topic_visualization_data(self):
        """Prepare data for visualizations"""
        return {
            'topics': self.topics,
            'probabilities': self.probabilities,
            'embeddings': self.embeddings,
            'documents': self.documents,
            'topic_model': self.topic_model
        }

    def get_technique_summary(self):
        """Get technique summary"""
        return self.technique_used

# =============================================================================
# COMPREHENSIVE EVALUATION METRICS WITH FIXED COHERENCE
# =============================================================================

class ComprehensiveEvaluator:
    def __init__(self):
        self.metrics = {}

    def evaluate_topic_model(self, topic_model, documents, embeddings, topics, probabilities, model_type='bertopic'):
        """Comprehensive evaluation of topic model quality"""
        print(f" Running comprehensive evaluation for {model_type}...")

        self.metrics = {}

        # Basic Statistics
        self.metrics['basic_stats'] = self._compute_basic_statistics(topics, probabilities)

        # Cluster Quality Metrics (for BERTopic)
        if model_type == 'bertopic':
            self.metrics['cluster_quality'] = self._compute_cluster_quality_metrics(embeddings, topics)

        # Topic Coherence Metrics
        self.metrics['coherence_metrics'] = self._compute_comprehensive_coherence(topic_model, documents, topics, model_type)

        # Topic Diversity
        self.metrics['diversity_metrics'] = self._compute_topic_diversity(topic_model, model_type)

        # Overall Quality Score
        self.metrics['overall_quality'] = self._compute_overall_quality_score(model_type)

        return self.metrics

    def _compute_basic_statistics(self, topics, probabilities):
        """Compute basic topic statistics"""
        stats = {}

        # Topic counts
        unique_topics = set(topics)
        stats['n_topics'] = len(unique_topics) - (1 if -1 in unique_topics else 0)
        stats['n_outliers'] = np.sum(np.array(topics) == -1)
        stats['outlier_ratio'] = stats['n_outliers'] / len(topics)

        return stats

    def _compute_cluster_quality_metrics(self, embeddings, topics):
        """Compute comprehensive cluster quality metrics"""
        metrics = {}

        try:
            # Filter out outliers for clustering metrics
            valid_indices = [i for i, topic in enumerate(topics) if topic != -1]

            if len(valid_indices) > 1:
                valid_embeddings = embeddings[valid_indices]
                valid_topics = [topics[i] for i in valid_indices]

                if len(set(valid_topics)) > 1:
                    # Silhouette Score
                    metrics['silhouette_score'] = silhouette_score(
                        valid_embeddings, valid_topics, metric='cosine'
                    )

                    # Calinski-Harabasz Index
                    metrics['calinski_harabasz'] = calinski_harabasz_score(
                        valid_embeddings, valid_topics
                    )

                    # Davies-Bouldin Index (lower is better)
                    metrics['davies_bouldin'] = davies_bouldin_score(
                        valid_embeddings, valid_topics
                    )
                else:
                    metrics['silhouette_score'] = 0.0
                    metrics['calinski_harabasz'] = 0.0
                    metrics['davies_bouldin'] = float('inf')

        except Exception as e:
            print(f"  Cluster metrics computation failed: {e}")
            metrics['silhouette_score'] = 0.0
            metrics['calinski_harabasz'] = 0.0
            metrics['davies_bouldin'] = float('inf')

        return metrics

    def _compute_comprehensive_coherence(self, topic_model, documents, topics, model_type):
        """Compute multiple coherence metrics with enhanced error handling"""
        coherence_metrics = {}

        try:
            # Sample documents for coherence calculation
            if len(documents) > config.SAMPLE_SIZE:
                sample_indices = np.random.choice(len(documents), min(config.SAMPLE_SIZE, len(documents)), replace=False)
                sample_docs = [documents[i] for i in sample_indices]
            else:
                sample_docs = documents

            # Tokenize documents
            tokenized_docs = [doc.split() for doc in sample_docs if doc and len(doc.split()) > 5]

            if len(tokenized_docs) < 10:
                print("  Not enough documents for coherence calculation")
                return {'c_v': 0.0, 'u_mass': 0.0, 'c_npmi': 0.0}

            # Create dictionary and corpus
            dictionary = corpora.Dictionary(tokenized_docs)
            dictionary.filter_extremes(no_below=2, no_above=0.8)
            corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

            # Get topic words based on model type
            topic_words = []
            if model_type == 'bertopic':
                topic_info = topic_model.get_topic_info()
                for topic_id in topic_info['Topic'].values:
                    if topic_id != -1:
                        topic = topic_model.get_topic(topic_id)
                        if topic:
                            words = [word for word, _ in topic[:10]]
                            topic_words.append(words)
            elif model_type == 'ctm':
                topics_dict = topic_model.get_topics()
                for topic_id in sorted(topics_dict.keys()):
                    topic_words.append(topics_dict[topic_id][:10])

            if not topic_words or len(topic_words) < 2:
                print("  Not enough topics for coherence calculation")
                return {'c_v': 0.0, 'u_mass': 0.0, 'c_npmi': 0.0}

            # C_V coherence
            try:
                coherence_model_cv = CoherenceModel(
                    topics=topic_words,
                    texts=tokenized_docs,
                    corpus=corpus,
                    dictionary=dictionary,
                    coherence='c_v',
                    topn=10
                )
                coherence_metrics['c_v'] = coherence_model_cv.get_coherence()
            except Exception as e:
                print(f"  C_V coherence failed: {e}")
                coherence_metrics['c_v'] = 0.0

            # U_Mass coherence with better error handling
            try:
                if len(corpus) > 0 and len(dictionary) > 0:
                    coherence_model_umass = CoherenceModel(
                        topics=topic_words,
                        texts=tokenized_docs,
                        corpus=corpus,
                        dictionary=dictionary,
                        coherence='u_mass',
                        topn=10
                    )
                    umass_score = coherence_model_umass.get_coherence()
                    coherence_metrics['u_mass'] = umass_score if not np.isnan(umass_score) else 0.0
                else:
                    coherence_metrics['u_mass'] = 0.0
            except Exception as e:
                print(f"  U_Mass coherence failed: {e}")
                coherence_metrics['u_mass'] = 0.0

            # Additional coherence metrics
            try:
                coherence_model_npmi = CoherenceModel(
                    topics=topic_words,
                    texts=tokenized_docs,
                    corpus=corpus,
                    dictionary=dictionary,
                    coherence='c_npmi',
                    topn=10
                )
                coherence_metrics['c_npmi'] = coherence_model_npmi.get_coherence()
            except:
                coherence_metrics['c_npmi'] = 0.0

        except Exception as e:
            print(f"  Coherence computation failed: {e}")
            coherence_metrics['c_v'] = 0.0
            coherence_metrics['u_mass'] = 0.0
            coherence_metrics['c_npmi'] = 0.0

        return coherence_metrics

    def _compute_topic_diversity(self, topic_model, model_type):
        """Compute topic diversity metrics"""
        diversity_metrics = {}

        try:
            # Get all topic words
            all_topic_words = []
            topic_keyword_sets = []

            if model_type == 'bertopic':
                topic_info = topic_model.get_topic_info()
                for topic_id in topic_info['Topic'].values:
                    if topic_id != -1:
                        topic = topic_model.get_topic(topic_id)
                        if topic:
                            words = [word for word, _ in topic[:10]]
                            all_topic_words.extend(words)
                            topic_keyword_sets.append(set(words))
            elif model_type == 'ctm':
                topics_dict = topic_model.get_topics()
                for topic_id in sorted(topics_dict.keys()):
                    words = topics_dict[topic_id][:10]
                    all_topic_words.extend(words)
                    topic_keyword_sets.append(set(words))

            if all_topic_words:
                # Unique word ratio
                unique_words = set(all_topic_words)
                diversity_metrics['unique_word_ratio'] = len(unique_words) / len(all_topic_words)

                # Topic uniqueness
                if len(topic_keyword_sets) > 1:
                    jaccard_similarities = []
                    for i in range(len(topic_keyword_sets)):
                        for j in range(i+1, len(topic_keyword_sets)):
                            intersection = len(topic_keyword_sets[i] & topic_keyword_sets[j])
                            union = len(topic_keyword_sets[i] | topic_keyword_sets[j])
                            if union > 0:
                                jaccard_similarities.append(intersection / union)

                    if jaccard_similarities:
                        diversity_metrics['avg_inter_topic_similarity'] = np.mean(jaccard_similarities)
                        diversity_metrics['max_inter_topic_similarity'] = np.max(jaccard_similarities)

        except Exception as e:
            print(f"  Diversity computation failed: {e}")

        return diversity_metrics

    def _compute_overall_quality_score(self, model_type):
        """Compute overall quality score from all metrics"""
        quality_score = 0.0

        try:
            if model_type == 'bertopic':
                weights = {
                    'silhouette_score': 0.25,
                    'c_v': 0.35,
                    'outlier_ratio': 0.20,
                    'unique_word_ratio': 0.20
                }

                # Silhouette score component
                silhouette = self.metrics['cluster_quality'].get('silhouette_score', 0)
                quality_score += silhouette * weights['silhouette_score']

                # Outlier component (inverse)
                outlier_ratio = self.metrics['basic_stats'].get('outlier_ratio', 0)
                quality_score += (1 - outlier_ratio) * weights['outlier_ratio']

            else:  # CTM
                weights = {
                    'c_v': 0.50,
                    'unique_word_ratio': 0.50
                }

            # Coherence component
            coherence = self.metrics['coherence_metrics'].get('c_v', 0)
            quality_score += coherence * weights['c_v']

            # Diversity component
            diversity = self.metrics['diversity_metrics'].get('unique_word_ratio', 0)
            quality_score += diversity * weights.get('unique_word_ratio', 0)

        except Exception as e:
            print(f" Overall quality score computation failed: {e}")

        return max(0.0, min(1.0, quality_score))

    def get_quality_assessment(self, overall_score):
        """Get qualitative assessment of model quality"""
        if overall_score >= 0.8:
            return 'EXCELLENT'
        elif overall_score >= 0.6:
            return 'GOOD'
        elif overall_score >= 0.4:
            return 'FAIR'
        else:
            return 'POOR'

    def print_comprehensive_evaluation(self, model_type):
        """Print comprehensive evaluation results"""
        print(f"\n {model_type.upper()} COMPREHENSIVE EVALUATION")
        print("="*80)

        # Basic Statistics
        basic = self.metrics['basic_stats']
        print(f"\n BASIC STATISTICS:")
        print(f"   Topics: {basic['n_topics']}")
        print(f"   Outliers: {basic['n_outliers']} ({basic['outlier_ratio']*100:.1f}%)")

        # Cluster Quality Metrics (BERTopic only)
        if model_type == 'bertopic':
            cluster = self.metrics['cluster_quality']
            print(f"\n CLUSTER QUALITY:")
            print(f"   Silhouette Score: {cluster.get('silhouette_score', 0):.3f}")
            print(f"   Calinski-Harabasz: {cluster.get('calinski_harabasz', 0):.1f}")
            print(f"   Davies-Bouldin: {cluster.get('davies_bouldin', 0):.3f}")

        # Coherence Metrics
        coherence = self.metrics['coherence_metrics']
        print(f"\n COHERENCE METRICS:")
        print(f"   C_V Coherence: {coherence.get('c_v', 0):.3f}")
        print(f"   U_Mass Coherence: {coherence.get('u_mass', 0):.3f}")
        print(f"   C_NPMI Coherence: {coherence.get('c_npmi', 0):.3f}")

        # Diversity Metrics
        diversity = self.metrics['diversity_metrics']
        print(f"\n TOPIC DIVERSITY:")
        print(f"   Unique Word Ratio: {diversity.get('unique_word_ratio', 0):.3f}")
        if 'avg_inter_topic_similarity' in diversity:
            print(f"   Avg Inter-topic Similarity: {diversity['avg_inter_topic_similarity']:.3f}")

        # Overall Quality
        overall = self.metrics['overall_quality']
        assessment = self.get_quality_assessment(overall)
        print(f"\n OVERALL QUALITY:")
        print(f"   Overall Score: {overall:.3f}/1.000")
        print(f"   Overall Assessment: {assessment}")

        print("="*80)

# =============================================================================
# TOPIC ANALYSIS AND VISUALIZATION
# =============================================================================

class TopicAnalysisVisualizer:
    def __init__(self, results_dir="enhanced_topic_modeling_output"):
        self.results_dir = results_dir
        self.bertopic_results = None
        self.load_results()

    def load_results(self):
        """Load saved topic modeling results"""
        try:
            bertopic_file = os.path.join(self.results_dir, "bertopic_results.pkl")
            with open(bertopic_file, 'rb') as f:
                self.bertopic_results = pickle.load(f)
            print(" BERTopic results loaded successfully")
        except Exception as e:
            print(f" Failed to load results: {e}")

    def analyze_topic_quality(self):
        """Comprehensive analysis of topic quality"""
        if self.bertopic_results is None:
            print(" No results to analyze")
            return

        model = self.bertopic_results['model']
        topics = self.bertopic_results['topics']

        print("\n" + "="*80)
        print(" COMPREHENSIVE TOPIC ANALYSIS")
        print("="*80)

        # Topic Information
        topic_info = model.get_topic_info()
        print(f"\n TOPIC DISTRIBUTION:")
        print(f"   Total topics found: {len(topic_info) - 1}")  # Exclude outlier topic
        print(f"   Outlier documents: {len([t for t in topics if t == -1])} ({len([t for t in topics if t == -1])/len(topics)*100:.1f}%)")

        # Topic sizes
        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].values
        print(f"   Average topic size: {np.mean(topic_sizes):.1f} documents")
        print(f"   Largest topic: {np.max(topic_sizes)} documents")
        print(f"   Smallest topic: {np.min(topic_sizes)} documents")

        # Topic coherence analysis
        self._analyze_topic_coherence(model, topic_info)

        # Domain classification
        self._classify_topics_by_domain(model, topic_info)

    def _analyze_topic_coherence(self, model, topic_info):
        """Analyze topic coherence and quality"""
        print(f"\n TOPIC COHERENCE ANALYSIS:")

        # Get representative documents for each topic
        for topic_id in topic_info['Topic'].values:
            if topic_id == -1:
                continue

            topic_words = model.get_topic(topic_id)
            if topic_words:
                top_words = [word for word, _ in topic_words[:5]]
                print(f"   Topic {topic_id:2d}: {', '.join(top_words)}")

    def _classify_topics_by_domain(self, model, topic_info):
        """Classify topics into scientific domains"""
        print(f"\n  DOMAIN CLASSIFICATION:")

        domain_matches = {}

        for topic_id in topic_info['Topic'].values:
            if topic_id == -1:
                continue

            topic_words = model.get_topic(topic_id)
            if not topic_words:
                continue

            # Get all words from topic
            all_topic_words = [word for word, _ in topic_words]

            # Check domain matches
            topic_domains = []
            for domain, keywords in config.DOMAIN_LEXICONS.items():
                matches = sum(1 for word in all_topic_words if word in keywords)
                if matches >= 2:  # At least 2 keyword matches
                    topic_domains.append((domain, matches))

            # Sort by match count
            topic_domains.sort(key=lambda x: x[1], reverse=True)

            if topic_domains:
                primary_domain = topic_domains[0][0]
                if primary_domain not in domain_matches:
                    domain_matches[primary_domain] = 0
                domain_matches[primary_domain] += 1

                top_words = [word for word, _ in topic_words[:3]]
                print(f"   Topic {topic_id:2d}: {primary_domain} - {', '.join(top_words)}")

        print(f"\n DOMAIN DISTRIBUTION:")
        for domain, count in sorted(domain_matches.items(), key=lambda x: x[1], reverse=True):
            print(f"   {domain:<20}: {count:2d} topics")

    def generate_topic_report(self):
        """Generate comprehensive topic modeling report"""
        if self.bertopic_results is None:
            print(" No results to generate report")
            return

        print("\n" + "="*80)
        print(" COMPREHENSIVE TOPIC MODELING REPORT")
        print("="*80)

        model = self.bertopic_results['model']
        topics = self.bertopic_results['topics']
        evaluation = self.bertopic_results['evaluation']

        # Basic statistics
        unique_topics = len(set(topics)) - (1 if -1 in topics else 0)
        outlier_count = len([t for t in topics if t == -1])

        print(f"\n MODEL PERFORMANCE SUMMARY:")
        print(f"   • Topics Identified: {unique_topics}")
        print(f"   • Outlier Documents: {outlier_count} ({outlier_count/len(topics)*100:.1f}%)")
        print(f"   • Overall Quality Score: {evaluation['overall_quality']:.3f} ({self.bertopic_results['quality_assessment']})")
        print(f"   • C_V Coherence: {evaluation['coherence_metrics'].get('c_v', 0):.3f}")
        print(f"   • Silhouette Score: {evaluation['cluster_quality'].get('silhouette_score', 0):.3f}")

        # Topic quality assessment
        print(f"\n TOPIC QUALITY ASSESSMENT:")
        topic_info = model.get_topic_info()

        # Analyze topic sizes
        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].values
        size_variation = np.std(topic_sizes) / np.mean(topic_sizes)

        if size_variation < 0.5:
            size_assessment = "GOOD (balanced topics)"
        elif size_variation < 1.0:
            size_assessment = "FAIR (moderate variation)"
        else:
            size_assessment = "POOR (high variation)"

        print(f"   • Topic Size Variation: {size_variation:.3f} - {size_assessment}")

        # Word diversity
        diversity = evaluation['diversity_metrics'].get('unique_word_ratio', 0)
        if diversity > 0.8:
            diversity_assessment = "EXCELLENT"
        elif diversity > 0.6:
            diversity_assessment = "GOOD"
        elif diversity > 0.4:
            diversity_assessment = "FAIR"
        else:
            diversity_assessment = "POOR"

        print(f"   • Word Diversity: {diversity:.3f} - {diversity_assessment}")

        # Recommendations
        print(f"\n RECOMMENDATIONS FOR IMPROVEMENT:")

        if evaluation['overall_quality'] < 0.6:
            print("   • Consider adjusting UMAP parameters for better clustering")
            print("   • Try different embedding models (e.g., all-mpnet-base-v2)")
            print("   • Increase minimum topic size for more coherent topics")

        if outlier_count / len(topics) > 0.2:
            print("   • High outlier rate - consider adjusting HDBSCAN parameters")

        if diversity < 0.6:
            print("   • Low word diversity - topics may be too similar")

        print(f"\n Results saved in: {self.results_dir}")

# =============================================================================
# COMPARATIVE ANALYSIS
# =============================================================================

class TopicModelingComparator:
    def __init__(self):
        self.results = {}

    def add_model_results(self, model_name, results):
        """Add model results for comparison"""
        self.results[model_name] = results

    def print_comparative_analysis(self):
        """Print comparative analysis of all models"""
        print("\n" + "="*80)
        print(" COMPARATIVE TOPIC MODELING ANALYSIS")
        print("="*80)

        if not self.results:
            print(" No results to compare")
            return

        print(f"\n{'Model':<20} | {'Topics':<8} | {'Quality Score':<12} | {'Assessment':<10} | {'Training Time':<12}")
        print("-" * 80)

        for model_name, results in self.results.items():
            topics = results.get('n_topics', 0)
            quality_score = results.get('overall_quality', 0)
            assessment = results.get('quality_assessment', 'UNKNOWN')
            training_time = results.get('training_time', 0)

            print(f"{model_name:<20} | {topics:<8} | {quality_score:<12.3f} | {assessment:<10} | {training_time:<12.2f}s")

        print("\n KEY INSIGHTS:")

        # Find best model
        best_model = max(self.results.items(), key=lambda x: x[1].get('overall_quality', 0))
        print(f"   • Best performing model: {best_model[0]} (Score: {best_model[1]['overall_quality']:.3f})")

        # Compare topic counts
        topic_counts = [results.get('n_topics', 0) for results in self.results.values()]
        if len(set(topic_counts)) > 1:
            print(f"   • Topic count range: {min(topic_counts)} - {max(topic_counts)}")

        # Compare coherence
        coherence_scores = [results.get('coherence_metrics', {}).get('c_v', 0) for results in self.results.values()]
        if any(coherence_scores):
            best_coherence = max(coherence_scores)
            print(f"   • Best coherence score: {best_coherence:.3f}")

# =============================================================================
# MAIN EXECUTION - COMPLETE PIPELINE
# =============================================================================

class CompleteTopicModelingPipeline:
    def __init__(self):
        self.data_loader = TopicModelingDataLoader()
        self.embedding_generator = EnhancedEmbeddingsGenerator()
        self.evaluator = ComprehensiveEvaluator()
        self.comparator = TopicModelingComparator()

    def run_complete_pipeline(self):
        """Run complete topic modeling pipeline with both BERTopic and CTM"""
        print(" STARTING COMPLETE TOPIC MODELING PIPELINE")
        print("="*80)

        all_results = {}

        try:
            # Step 1: Load and prepare data
            print("\n STEP 1: LOADING AND PREPARING DATA")
            data_df = self.data_loader.load_data()
            documents, filenames = self.data_loader.get_documents_for_topic_modeling()

            if len(documents) == 0:
                raise ValueError("No valid documents found")

            # Step 2: Generate embeddings
            print("\n STEP 2: GENERATING EMBEDDINGS")
            embeddings = self.embedding_generator.generate_embeddings(documents)

            # Step 3: Run BERTopic
            print("\n STEP 3: RUNNING BERTopic")
            bertopic_results = self._run_bertopic(documents, embeddings)
            all_results['BERTopic'] = bertopic_results

            # Step 4: Run CTM (if available)
            if CTM_AVAILABLE:
                print("\n STEP 4: RUNNING CONTEXTUALIZED TOPIC MODEL (CTM)")
                ctm_results = self._run_ctm(documents, embeddings)
                if ctm_results is not None:
                    all_results['CTM'] = ctm_results
                else:
                    print("  CTM failed, continuing with BERTopic only")
            else:
                print("\n  SKIPPING CTM (library not available)")

            # Step 5: Comparative analysis
            print("\n STEP 5: COMPARATIVE ANALYSIS")
            self._run_comparative_analysis(all_results)

            # Step 6: Save results
            print("\n STEP 6: SAVING RESULTS")
            self._save_results(all_results)

            # Step 7: Enhanced analysis
            print("\n STEP 7: ENHANCED ANALYSIS")
            self._run_enhanced_analysis()

            print("\n COMPLETE TOPIC MODELING PIPELINE FINISHED SUCCESSFULLY!")
            return all_results

        except Exception as e:
            print(f" Pipeline execution failed: {e}")
            import traceback
            traceback.print_exc()
            return None

    def _run_bertopic(self, documents, embeddings):
        """Run BERTopic modeling"""
        print(" Running BERTopic...")
        start_time = datetime.now(timezone.utc)

        # Initialize and fit BERTopic
        bertopic_modeler = EnhancedBERTopicModeler()
        topics, probabilities = bertopic_modeler.fit_model(documents, embeddings)

        # Evaluate model
        evaluation_results = self.evaluator.evaluate_topic_model(
            bertopic_modeler.topic_model, documents, embeddings, topics, probabilities, 'bertopic'
        )

        training_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        # Prepare results
        results = {
            'model': bertopic_modeler.topic_model,
            'topics': topics,
            'probabilities': probabilities,
            'documents': documents,
            'embeddings': embeddings,
            'evaluation': evaluation_results,
            'n_topics': len(set(topics)) - (1 if -1 in topics else 0),
            'training_time': training_time,
            'overall_quality': evaluation_results['overall_quality'],
            'quality_assessment': self.evaluator.get_quality_assessment(evaluation_results['overall_quality']),
            'technique_summary': bertopic_modeler.get_technique_summary()
        }

        # Print evaluation
        self.evaluator.print_comprehensive_evaluation('bertopic')

        return results

    def _run_ctm(self, documents, embeddings):
        """Run Contextualized Topic Modeling with enhanced error handling"""
        print(" Running Contextualized Topic Model...")
        start_time = datetime.now(timezone.utc)

        try:
            # Initialize and train CTM
            ctm_model = SimplifiedCTM()
            ctm_model.prepare_data(documents, embeddings)
            ctm_model.train_model()

            # Get topic distributions and topics
            topic_distributions = ctm_model.get_topic_distributions()
            topics = np.argmax(topic_distributions, axis=1)  # Convert to hard assignments

            # Evaluate model
            evaluation_results = self.evaluator.evaluate_topic_model(
                ctm_model, documents, embeddings, topics, topic_distributions, 'ctm'
            )

            training_time = (datetime.now(timezone.utc) - start_time).total_seconds()

            # Prepare results
            results = {
                'model': ctm_model,
                'topics': topics,
                'probabilities': topic_distributions,
                'evaluation': evaluation_results,
                'n_topics': config.CTM_CONFIG['num_topics'],
                'training_time': training_time,
                'overall_quality': evaluation_results['overall_quality'],
                'quality_assessment': self.evaluator.get_quality_assessment(evaluation_results['overall_quality']),
                'technique_summary': ctm_model.get_technique_summary()
            }

            # Print evaluation
            self.evaluator.print_comprehensive_evaluation('ctm')

            return results

        except Exception as e:
            print(f" CTM failed: {e}")
            return None

    def _run_comparative_analysis(self, all_results):
        """Run comparative analysis between models"""
        print(" Running comparative analysis...")

        for model_name, results in all_results.items():
            if results is not None:
                self.comparator.add_model_results(model_name, {
                    'n_topics': results['n_topics'],
                    'overall_quality': results['overall_quality'],
                    'quality_assessment': results['quality_assessment'],
                    'training_time': results['training_time'],
                    'coherence_metrics': results['evaluation']['coherence_metrics']
                })

        self.comparator.print_comparative_analysis()

    def _run_enhanced_analysis(self):
        """Run enhanced analysis on results"""
        print(" Running enhanced analysis...")
        visualizer = TopicAnalysisVisualizer()
        visualizer.analyze_topic_quality()
        visualizer.generate_topic_report()

    def _save_results(self, all_results):
        """Save all results to files"""
        output_dir = config.OUTPUT_DIR

        # Save complete results
        results_file = os.path.join(output_dir, "complete_topic_modeling_results.pkl")
        with open(results_file, 'wb') as f:
            pickle.dump(all_results, f)
        print(f" Complete results saved to: {results_file}")

        # Save individual model results
        for model_name, results in all_results.items():
            if results is not None:
                model_file = os.path.join(output_dir, f"{model_name.lower()}_results.pkl")
                with open(model_file, 'wb') as f:
                    pickle.dump(results, f)
                print(f" {model_name} results saved to: {model_file}")

        # Save comparative analysis
        comparison_file = os.path.join(output_dir, "model_comparison.csv")
        comparison_data = []
        for model_name, results in all_results.items():
            if results is not None:
                comparison_data.append({
                    'Model': model_name,
                    'Topics': results['n_topics'],
                    'Quality_Score': results['overall_quality'],
                    'Quality_Assessment': results['quality_assessment'],
                    'Training_Time_Seconds': results['training_time'],
                    'Coherence_CV': results['evaluation']['coherence_metrics'].get('c_v', 0),
                    'Coherence_UMass': results['evaluation']['coherence_metrics'].get('u_mass', 0),
                    'Coherence_NPMI': results['evaluation']['coherence_metrics'].get('c_npmi', 0)
                })

        if comparison_data:
            comparison_df = pd.DataFrame(comparison_data)
            comparison_df.to_csv(comparison_file, index=False)
            print(f" Comparative analysis saved to: {comparison_file}")

# =============================================================================
# RUN COMPLETE PIPELINE
# =============================================================================

if __name__ == "__main__":
    # Set logging level
    logging.getLogger("bertopic").setLevel(logging.WARNING)
    logging.getLogger("umap").setLevel(logging.WARNING)
    logging.getLogger("hdbscan").setLevel(logging.WARNING)

    print(" COMPLETE TOPIC MODELING PIPELINE CONFIGURATION:")
    print(f"   • Embedding: {config.CURRENT_EMBEDDING_MODEL}")
    print(f"   • BERTopic: Enhanced with SciBERT")
    print(f"   • CTM: {'Available' if CTM_AVAILABLE else 'Not Available'}")
    print(f"   • Output: {config.OUTPUT_DIR}")

    # Execute the complete pipeline
    pipeline = CompleteTopicModelingPipeline()
    results = pipeline.run_complete_pipeline()

    if results is not None:
        print("\n COMPLETE TOPIC MODELING PIPELINE SUCCESSFULLY COMPLETED!")

        # Print final summary
        print("\n FINAL SUMMARY:")
        for model_name, model_results in results.items():
            if model_results is not None:
                print(f"   • {model_name}: {model_results['n_topics']} topics, "
                      f"Quality: {model_results['overall_quality']:.3f} ({model_results['quality_assessment']})")

        print("\n Ready for research applications and publications!")

        # Show sample topics from BERTopic
        if 'BERTopic' in results and results['BERTopic'] is not None:
            print("\n SAMPLE TOPICS FROM BERTOPIC:")
            model = results['BERTopic']['model']
            topic_info = model.get_topic_info()

            for i, (_, row) in enumerate(topic_info.head(8).iterrows()):
                if row['Topic'] == -1:
                    continue

                topic_words = model.get_topic(row['Topic'])
                if topic_words:
                    words = [word for word, _ in topic_words[:6]]
                    print(f"   Topic {row['Topic']:2d} ({row['Count']:3d} docs): {', '.join(words)}")
    else:
        print("\n Pipeline failed")

 NLTK stopwords downloaded successfully
✅ Contextualized Topic Models (CTM) imported successfully
All imports completed successfully!
NumPy version: 2.0.2
Pandas version: 2.2.2
PyTorch version: 2.8.0+cu126
 Enhanced Topic Modeling configuration initialized
 Output directory: enhanced_topic_modeling_output
 Total domain keywords: 433
 CTM Available: True
 COMPLETE TOPIC MODELING PIPELINE CONFIGURATION:
   • Embedding: allenai/scibert_scivocab_uncased
   • BERTopic: Enhanced with SciBERT
   • CTM: Available
   • Output: enhanced_topic_modeling_output
 Using device: cpu
 STARTING COMPLETE TOPIC MODELING PIPELINE

 STEP 1: LOADING AND PREPARING DATA
 Loading data for enhanced topic modeling...
 Loaded 924 documents from processed_combined_texts.csv
 Training labels loaded
 Applying enhanced text cleaning with stopword and link removal...
 Text statistics: mean=62.0 words, min=50.0, max=80.0
 Text reduction: 98.5% size reduction after cleaning
 Data loaded in 7.53s: 924 documents
 Prepared 

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

 SciBERT model loaded in 13.29s
 Embedding dimension: 768
 Generating embeddings for 924 documents...
 Using batch size: 16


Generating embeddings:   0%|          | 0/58 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 58/58 [01:23<00:00,  1.44s/it]


 Embeddings generated in 83.29s (11.1 docs/s)
 Embeddings shape: (924, 768)

 STEP 3: RUNNING BERTopic
 Running BERTopic...
 Fitting enhanced BERTopic model...
 Initializing enhanced BERTopic...
 Enhanced BERTopic initialized
 Enhanced BERTopic modeling completed in 12.20s
 Results: 37 topics, 0 outliers (0.0%)
 Running comprehensive evaluation for bertopic...

 BERTOPIC COMPREHENSIVE EVALUATION

 BASIC STATISTICS:
   Topics: 37
   Outliers: 137 (14.8%)

 CLUSTER QUALITY:
   Silhouette Score: 0.123
   Calinski-Harabasz: 15.6
   Davies-Bouldin: 2.614

 COHERENCE METRICS:
   C_V Coherence: 0.641
   U_Mass Coherence: 0.000
   C_NPMI Coherence: nan

 TOPIC DIVERSITY:
   Unique Word Ratio: 0.870
   Avg Inter-topic Similarity: 0.006

 OVERALL QUALITY:
   Overall Score: 0.599/1.000
   Overall Assessment: FAIR

 STEP 4: RUNNING CONTEXTUALIZED TOPIC MODEL (CTM)
 Running Contextualized Topic Model...
 Preparing data for Simplified CTM...
 Preprocessed 924 documents
 Vocabulary size: 5000


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

 Data preparation completed for CTM
 Training Simplified CTM with 20 topics...


Epoch: [50/50]	 Seen Samples: [44800/46200]	Train Loss: 538.8721008300781	Time: 0:00:01.917133: : 50it [01:42,  2.05s/it]
100%|██████████| 29/29 [00:00<00:00, 74.76it/s]


 CTM training completed in 103.00s


100%|██████████| 29/29 [00:00<00:00, 75.36it/s]


 Running comprehensive evaluation for ctm...

 CTM COMPREHENSIVE EVALUATION

 BASIC STATISTICS:
   Topics: 20
   Outliers: 0 (0.0%)

 COHERENCE METRICS:
   C_V Coherence: 0.545
   U_Mass Coherence: -11.214
   C_NPMI Coherence: -0.215

 TOPIC DIVERSITY:
   Unique Word Ratio: 1.000
   Avg Inter-topic Similarity: 0.000

 OVERALL QUALITY:
   Overall Score: 0.773/1.000
   Overall Assessment: GOOD

 STEP 5: COMPARATIVE ANALYSIS
 Running comparative analysis...

 COMPARATIVE TOPIC MODELING ANALYSIS

Model                | Topics   | Quality Score | Assessment | Training Time
--------------------------------------------------------------------------------
BERTopic             | 37       | 0.599        | FAIR       | 13.87       s
CTM                  | 20       | 0.773        | GOOD       | 163.59      s

 KEY INSIGHTS:
   • Best performing model: CTM (Score: 0.773)
   • Topic count range: 20 - 37
   • Best coherence score: 0.641

 STEP 6: SAVING RESULTS
 Complete results saved to: enhanced_to

In [None]:
import os
from google.colab import files

output_dir = "/content/phase1_output"
zip_path = f"{output_dir}.zip"

# Zip the directory
!zip -r "$zip_path" "$output_dir"

# Download the zip file
if os.path.exists(zip_path):
  files.download(zip_path)
else:
  print(f"Zip file not found at {zip_path}")