In [6]:
!pip install bertopic sentence-transformers umap-learn hdbscan plotly



In [7]:
!pip install --upgrade nbformat



In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# BERTopic and related imports
try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from umap import UMAP
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction.text import CountVectorizer
    BERTOPIC_AVAILABLE = True
except ImportError:
    print("BERTopic dependencies not found. Please install with:")
    print("pip install bertopic sentence-transformers umap-learn hdbscan")
    BERTOPIC_AVAILABLE = False

# Alternative lightweight implementation for environments without BERTopic
import re
from collections import Counter, defaultdict
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity


RuntimeError: Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [None]:
class BERTopicPipeline:
    """
    Comprehensive BERTopic modeling pipeline for analyzing topics in text collections.
    Handles short to medium texts (80-400 characters) effectively.
    """
    
    def __init__(self, embedding_model='all-MiniLM-L6-v2', language='english', random_state=42):
        """
        Initialize BERTopic pipeline.
        
        Args:
            embedding_model: Sentence transformer model name
            language: Language for preprocessing
            random_state: Random state for reproducibility
        """
        self.embedding_model_name = embedding_model
        self.language = language
        self.random_state = random_state
        self.topic_model = None
        self.embeddings = None
        self.texts = None
        self.topics = None
        self.topic_info = None
        
        if BERTOPIC_AVAILABLE:
            # Initialize embedding model
            self.embedding_model = SentenceTransformer(embedding_model)
            
            # Initialize UMAP for dimensionality reduction
            self.umap_model = UMAP(
                n_neighbors=5,          # Increase neighbors
                n_components=5,          # Use more components initially
                min_dist=0.0,
                metric='cosine',
                random_state=42,
                init='random'            # Use random initialization instead of spectral
            )
            
            # Initialize HDBSCAN for clustering
            self.hdbscan_model = HDBSCAN(
                min_cluster_size=5,
                metric='euclidean',
                cluster_selection_method='eom',
                prediction_data=True
            )
        else:
            print("Using lightweight fallback implementation")
    
    def preprocess_texts(self, texts):
        """
        Basic preprocessing for texts.
        """
        processed_texts = []
        for text in texts:
            # Basic cleaning
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
            text = re.sub(r'\S+@\S+', '', text)
            text = re.sub(r'[^\w\s]', ' ', text)
            text = re.sub(r'\s+', ' ', text).strip()
            
            if len(text) > 10:  # Keep texts with reasonable length
                processed_texts.append(text)
        
        return processed_texts
    
    def fit_bertopic(self, texts, nr_topics='auto', min_topic_size=3):
        """
        Fit BERTopic model on the provided texts.
        """
        if not BERTOPIC_AVAILABLE:
            return self._fit_fallback(texts, nr_topics)
        
        print("Preprocessing texts...")
        self.texts = self.preprocess_texts(texts)
        print(f"Processing {len(self.texts)} texts...")
        
        # Create embeddings
        print("Creating embeddings...")
        self.embeddings = self.embedding_model.encode(self.texts, show_progress_bar=True)
        
        # Adjust clustering parameters based on text count
        if len(self.texts) < 50:
            self.hdbscan_model.min_cluster_size = max(2, len(self.texts) // 20)
            self.umap_model.n_neighbors = min(10, len(self.texts) // 3)
        
        # Create custom vectorizer for better topic representation
        vectorizer_model = CountVectorizer(
            ngram_range=(1, 2),
            stop_words='english',
            min_df=1,
            max_df=0.9
        )
        
        # Initialize BERTopic with custom components
        self.topic_model = BERTopic(
            embedding_model=self.embedding_model,
            umap_model=self.umap_model,
            hdbscan_model=self.hdbscan_model,
            vectorizer_model=vectorizer_model,
            nr_topics=nr_topics,
            calculate_probabilities=True,
            verbose=True
        )
        
        # Fit the model
        print("Fitting BERTopic model...")
        self.topics, self.probabilities = self.topic_model.fit_transform(self.texts, self.embeddings)
        
        # Get topic information
        self.topic_info = self.topic_model.get_topic_info()
        
        print(f"Model fitted successfully!")
        print(f"Number of topics found: {len(self.topic_info) - 1}")  # -1 for outlier topic
        print(f"Number of outliers: {sum(1 for t in self.topics if t == -1)}")
        
        return self
    
    def _fit_fallback(self, texts, nr_topics):
        """
        Fallback implementation when BERTopic is not available.
        """
        print("Using fallback topic modeling (TF-IDF + K-Means)...")
        
        self.texts = self.preprocess_texts(texts)
        
        # Use TF-IDF for embeddings
        vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 2),
            stop_words='english',
            min_df=1,
            max_df=0.9
        )
        
        tfidf_matrix = vectorizer.fit_transform(self.texts)
        
        # Reduce dimensionality
        svd = TruncatedSVD(n_components=min(50, len(self.texts)-1), random_state=self.random_state)
        self.embeddings = svd.fit_transform(tfidf_matrix)
        
        # Determine number of topics
        if nr_topics == 'auto':
            nr_topics = min(10, max(2, len(self.texts) // 10))
        elif isinstance(nr_topics, str):
            nr_topics = 5
        
        # Cluster documents
        kmeans = KMeans(n_clusters=nr_topics, random_state=self.random_state)
        self.topics = kmeans.fit_predict(self.embeddings)
        
        # Create topic info
        self.topic_info = self._create_fallback_topic_info(vectorizer, tfidf_matrix)
        
        print(f"Fallback model fitted with {nr_topics} topics")
        return self
    
    def _create_fallback_topic_info(self, vectorizer, tfidf_matrix):
        """
        Create topic information for fallback implementation.
        """
        feature_names = vectorizer.get_feature_names_out()
        topic_info = []
        
        for topic_id in range(max(self.topics) + 1):
            # Get documents in this topic
            topic_docs = [i for i, t in enumerate(self.topics) if t == topic_id]
            
            if topic_docs:
                # Calculate average TF-IDF for this topic
                topic_tfidf = tfidf_matrix[topic_docs].mean(axis=0).A1
                
                # Get top words
                top_indices = topic_tfidf.argsort()[-10:][::-1]
                top_words = [feature_names[i] for i in top_indices]
                
                topic_info.append({
                    'Topic': topic_id,
                    'Count': len(topic_docs),
                    'Name': '_'.join(top_words[:3]),
                    'Representation': top_words
                })
        
        return pd.DataFrame(topic_info)
    
    def get_topic_info(self):
        """
        Get information about discovered topics.
        """
        if self.topic_info is None:
            raise ValueError("Model not fitted. Please fit the model first.")
        
        return self.topic_info
    
    def get_topic_words(self, topic_id, n_words=10):
        """
        Get top words for a specific topic.
        """
        if not BERTOPIC_AVAILABLE:
            if topic_id < len(self.topic_info):
                return self.topic_info.iloc[topic_id]['Representation'][:n_words]
            return []
        
        if self.topic_model is None:
            raise ValueError("Model not fitted. Please fit the model first.")
        
        return [word for word, _ in self.topic_model.get_topic(topic_id)[:n_words]]
    
    def get_document_topics(self, texts=None, threshold=0.1):
        """
        Get topic assignments for documents.
        """
        if self.topics is None:
            raise ValueError("Model not fitted. Please fit the model first.")
        
        if texts is None:
            texts = self.texts
        
        doc_topics = []
        for i, (text, topic) in enumerate(zip(texts, self.topics)):
            topic_prob = 1.0  # Default probability
            
            if BERTOPIC_AVAILABLE and hasattr(self, 'probabilities') and self.probabilities is not None:
                if i < len(self.probabilities):
                    topic_probs = self.probabilities[i]
                    if topic >= 0 and topic < len(topic_probs):
                        topic_prob = topic_probs[topic]
            
            doc_topics.append({
                'text': text,
                'topic': topic,
                'probability': topic_prob,
                'is_outlier': topic == -1
            })
        
        return doc_topics
    
    def visualize_topics(self, width=800, height=600):
        """
        Create interactive topic visualization.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            # Use BERTopic's built-in visualization
            fig = self.topic_model.visualize_topics(width=width, height=height)
            fig.show()
        else:
            # Fallback visualization
            self._visualize_topics_fallback()
    
    def _visualize_topics_fallback(self):
        """
        Fallback visualization using matplotlib.
        """
        if self.embeddings is None:
            print("No embeddings available for visualization")
            return
        
        # Reduce to 2D for visualization
        tsne = TSNE(n_components=2, random_state=self.random_state, perplexity=min(30, len(self.texts)//4))
        embeddings_2d = tsne.fit_transform(self.embeddings)
        
        # Create scatter plot
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], 
                            c=self.topics, cmap='tab10', alpha=0.7)
        plt.colorbar(scatter, label='Topic')
        plt.title('Topic Visualization (t-SNE)')
        plt.xlabel('Dimension 1')
        plt.ylabel('Dimension 2')
        
        # Add topic labels
        for topic_id in range(max(self.topics) + 1):
            topic_points = embeddings_2d[np.array(self.topics) == topic_id]
            if len(topic_points) > 0:
                centroid = np.mean(topic_points, axis=0)
                plt.annotate(f'Topic {topic_id}', centroid,
                           xytext=(5, 5), textcoords='offset points',
                           bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7))
        
        plt.tight_layout()
        plt.show()
    
    def visualize_documents(self, custom_labels=None):
        """
        Visualize documents in topic space.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            fig = self.topic_model.visualize_documents(
                self.texts,
                custom_labels=custom_labels,
                width=1000,
                height=700
            )
            fig.show()
        else:
            print("Document visualization requires full BERTopic installation")
    
    def visualize_hierarchy(self):
        """
        Visualize topic hierarchy.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            fig = self.topic_model.visualize_hierarchy()
            fig.show()
        else:
            print("Hierarchy visualization requires full BERTopic installation")
    
    def visualize_barchart(self, topics=None, n_words=8):
        """
        Create bar chart of top words per topic.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            fig = self.topic_model.visualize_barchart(topics=topics, n_words=n_words)
            fig.show()
        else:
            self._visualize_barchart_fallback(n_words)
    
    def _visualize_barchart_fallback(self, n_words=8):
        """
        Fallback bar chart visualization.
        """
        if self.topic_info is None:
            return
        
        n_topics = min(6, len(self.topic_info))
        fig, axes = plt.subplots((n_topics + 1) // 2, 2, figsize=(15, 4 * ((n_topics + 1) // 2)))
        if n_topics == 1:
            axes = [axes]
        elif (n_topics + 1) // 2 == 1:
            axes = [axes]
        else:
            axes = axes.flatten()
        
        for i in range(n_topics):
            if i < len(self.topic_info):
                words = self.topic_info.iloc[i]['Representation'][:n_words]
                values = [1.0 - j*0.1 for j in range(len(words))]  # Dummy values
                
                axes[i].barh(range(len(words)), values)
                axes[i].set_yticks(range(len(words)))
                axes[i].set_yticklabels(words)
                axes[i].set_title(f'Topic {i}')
                axes[i].invert_yaxis()
        
        # Hide empty subplots
        for i in range(n_topics, len(axes)):
            axes[i].set_visible(False)
        
        plt.tight_layout()
        plt.show()
    
    def search_topics(self, search_term, top_k=5):
        """
        Search for topics related to a search term.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            similar_topics, similarity_scores = self.topic_model.find_topics(search_term, top_n=top_k)
            return list(zip(similar_topics, similarity_scores))
        else:
            # Fallback search
            search_results = []
            for i, row in self.topic_info.iterrows():
                topic_words = ' '.join(row['Representation'])
                if search_term.lower() in topic_words.lower():
                    search_results.append((row['Topic'], 1.0))  # Dummy similarity
            return search_results[:top_k]
    
    def get_topic_evolution(self, timestamps=None, nr_bins=10):
        """
        Analyze topic evolution over time (if timestamps provided).
        """
        if timestamps is None or not BERTOPIC_AVAILABLE:
            print("Topic evolution requires timestamps and full BERTopic installation")
            return None
        
        if self.topic_model is not None:
            topics_over_time = self.topic_model.topics_over_time(
                self.texts, timestamps, nr_bins=nr_bins
            )
            return topics_over_time
    
    def save_model(self, filepath):
        """
        Save the trained model.
        """
        if BERTOPIC_AVAILABLE and self.topic_model is not None:
            self.topic_model.save(filepath)
            print(f"Model saved to {filepath}")
        else:
            print("Model saving requires full BERTopic installation")
    
    def export_results(self, filename='bertopic_results.csv'):
        """
        Export topic modeling results to CSV.
        """
        doc_topics = self.get_document_topics()
        
        # Create results DataFrame
        results_df = pd.DataFrame([
            {
                'text': doc['text'],
                'topic': doc['topic'],
                'probability': doc['probability'],
                'is_outlier': doc['is_outlier']
            }
            for doc in doc_topics
        ])
        
        # Add topic information
        topic_words = {}
        for i in range(max(self.topics) + 1):
            words = self.get_topic_words(i, n_words=5)
            topic_words[i] = ', '.join(words) if words else ''
        
        results_df['topic_words'] = results_df['topic'].map(topic_words)
        
        # Save to CSV
        results_df.to_csv(filename, index=False)
        
        # Save topic summary
        topic_summary = self.get_topic_info()
        topic_summary.to_csv(f'topic_summary_{filename}', index=False)
        
        print(f"Results exported to {filename}")
        return results_df

In [None]:
def demo_bertopic():
    """
    Demonstrate BERTopic pipeline with sample data.
    """
    # Sample texts covering different topics
    # sample_texts = [
    #     "Machine learning algorithms are revolutionizing healthcare diagnostics and treatment",
    #     "Climate change is causing severe weather patterns and environmental damage",
    #     "Stock markets are volatile due to economic uncertainty and inflation",
    #     "Deep learning models like BERT are transforming natural language processing",
    #     "Renewable energy sources are becoming more cost-effective than fossil fuels",
    #     "Cryptocurrency adoption is growing despite regulatory challenges",
    #     "Computer vision technology is improving autonomous vehicle safety",
    #     "Global warming is melting polar ice caps and raising sea levels",
    #     "Financial markets react strongly to central bank policy decisions",
    #     "Neural networks are enabling breakthrough advances in AI research",
    #     "Environmental conservation efforts focus on protecting endangered species",
    #     "Investment portfolios need diversification to manage risk effectively",
    #     "Natural language understanding enables better human-computer interaction",
    #     "Climate policies aim to reduce carbon emissions and promote sustainability",
    #     "Algorithmic trading is changing how financial markets operate",
    #     "Artificial intelligence is transforming industries across the globe",
    #     "Ocean temperatures are rising due to greenhouse gas emissions",
    #     "Economic indicators suggest potential recession in major economies",
    #     "Transformer models have revolutionized machine translation systems",
    #     "Solar and wind energy are becoming dominant renewable technologies",
    #     "Large language models are reshaping content creation and digital communication",
    #     "Electric vehicles are driving innovation in battery technology and infrastructure",
    #     "Facial recognition systems raise ethical concerns about privacy and surveillance",
    #     "Quantum computing promises exponential speedup for complex problem solving",
    #     "Global supply chain disruptions are affecting manufacturing and retail sectors",
    #     "Gene editing technologies like CRISPR are transforming biotechnology research",
    #     "Cybersecurity threats are increasing as critical infrastructure becomes digital",
    #     "Data-driven agriculture improves crop yields and sustainability outcomes",
    #     "Space exploration is accelerating through public-private sector collaboration",
    #     "Biometric authentication is replacing traditional password-based security",
    #     "AI ethics frameworks are emerging to address bias and accountability in systems",
    #     "Telemedicine adoption is expanding healthcare access in rural communities",
    #     "Decentralized finance platforms challenge traditional banking systems",
    #     "Edge computing enables faster processing for IoT and real-time analytics",
    #     "Satellite data is advancing climate monitoring and disaster prediction",
    #     "Privacy-preserving machine learning protects sensitive user information",
    #     "Robotic process automation streamlines repetitive enterprise workflows",
    #     "Digital twins simulate real-world systems for better decision-making",
    #     "5G networks facilitate high-speed connectivity for smart cities and devices",
    #     "Augmented reality is transforming retail, education, and remote collaboration",
    #     "Blockchain technology is enabling secure and transparent digital transactions",
    #     "Artificial intelligence is optimizing logistics and supply chain operations",
    #     "Microplastics pollution poses serious risks to marine ecosystems and human health",
    #     "Virtual reality is enhancing simulation training in medicine and aviation",
    #     "Green hydrogen is emerging as a key solution for clean energy storage",
    #     "Machine learning accelerates drug discovery and personalized treatment plans",
    #     "Social media algorithms influence public opinion and political discourse",
    #     "Internet of Things devices are transforming home automation and smart living",
    #     "Climate adaptation strategies are vital for coastal city resilience",
    #     "Predictive analytics is improving maintenance in industrial manufacturing",
    #     "Autonomous drones are revolutionizing delivery services and aerial monitoring",
    #     "Remote work technologies are reshaping corporate culture and productivity",
    #     "Carbon capture and storage could mitigate industrial greenhouse gas emissions",
    #     "AI-generated content raises questions about authorship and misinformation",
    #     "Smart grids help balance electricity demand and integrate renewable sources",
    #     "Synthetic biology is engineering organisms for sustainable material production",
    #     "Economic inequality is widening due to uneven access to digital resources"
    # ]
    df = pd.read_csv('india2013.csv')
    sample_texts = df['Paragraph'].tolist()

    
    print("BERTopic Pipeline Demo")
    print("=" * 50)
    
    # Initialize pipeline
    pipeline = BERTopicPipeline(embedding_model='all-MiniLM-L6-v2')
    
    # Fit model
    pipeline.fit_bertopic(sample_texts, nr_topics='auto')
    
    # Display topic information
    print("\nTopic Information:")
    print("=" * 30)
    topic_info = pipeline.get_topic_info()
    print(topic_info)
    
    # Show top words for each topic
    print(f"\nTop Words per Topic:")
    print("=" * 30)
    for topic_id in range(len(topic_info)):
        if topic_id < max(pipeline.topics) + 1:
            words = pipeline.get_topic_words(topic_id, n_words=6)
            print(f"Topic {topic_id}: {', '.join(words)}")
    
    # Document topic assignments
    doc_topics = pipeline.get_document_topics()
    print(f"\nSample Document Assignments:")
    print("=" * 35)
    for i, doc in enumerate(doc_topics[:5]):
        print(f"Text: {doc['text'][:50]}...")
        print(f"Topic: {doc['topic']} (prob: {doc['probability']:.3f})")
        print()
    
    # Visualizations
    print("Creating visualizations...")
    # pipeline.visualize_topics()
    pipeline.visualize_barchart()
    
    # Search functionality
    print(f"\nTopic Search Results for 'machine learning':")
    search_results = pipeline.search_topics('machine learning', top_k=3)
    for topic_id, score in search_results:
        print(f"Topic {topic_id}: {score:.3f}")
    
    # Export results
    results_df = pipeline.export_results('demo_bertopic_results.csv')
    print(f"\nExported {len(results_df)} document results")
    
    return pipeline

# Quick setup function for easy use
def quick_bertopic_analysis(texts, nr_topics='auto', model_name='all-MiniLM-L6-v2'):
    """
    Quick function to perform BERTopic analysis with minimal setup.
    """
    pipeline = BERTopicPipeline(embedding_model=model_name)
    pipeline.fit_bertopic(texts, nr_topics=nr_topics)
    
    # Print summary
    topic_info = pipeline.get_topic_info()
    print(f"Found {len(topic_info)} topics in {len(texts)} documents")
    
    for i, row in topic_info.iterrows():
        if i < max(pipeline.topics) + 1:
            words = pipeline.get_topic_words(i, n_words=5)
            print(f"Topic {i}: {', '.join(words)}")
    
    pipeline.visualize_topics()
    return pipeline

In [None]:
if __name__ == "__main__":
    # Check if BERTopic is available
    if BERTOPIC_AVAILABLE:
        print("Full BERTopic functionality available")
        demo_pipeline = demo_bertopic()
    else:
        print("Running with fallback implementation")
        print("To use full BERTopic features, install: pip install bertopic sentence-transformers umap-learn hdbscan")
        demo_pipeline = demo_bertopic()

NameError: name 'BERTOPIC_AVAILABLE' is not defined