In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

# Try to import NLTK; when it is missing the notebook falls back to the
# simplified implementations and NLTK_AVAILABLE is set to False.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, SnowballStemmer
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize, sent_tokenize
    from nltk.tag import pos_tag

    # Download required NLTK data.
    # 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource ids
    # that newer NLTK releases look up for word_tokenize / pos_tag; the
    # legacy ids are kept for older releases.
    # NOTE(review): an id unknown to the installed release is expected to be
    # reported as a failed download (no exception) with the default
    # raise_on_error=False - confirm against the NLTK version in use.
    for resource in (
        'punkt',
        'punkt_tab',
        'stopwords',
        'wordnet',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
    ):
        nltk.download(resource, quiet=True)
    NLTK_AVAILABLE = True
    print("NLTK available with all required datasets!")
except ImportError:
    NLTK_AVAILABLE = False
    print("NLTK not available - will use simplified implementations")

# Set style and random seed
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    # Matplotlib releases older than 3.6 ship this style sheet under the
    # name 'seaborn'; fall back so the notebook still runs there.
    plt.style.use('seaborn')
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
# Create sample text corpus for demonstration
def create_sample_corpus():
    """Build the small demonstration corpus used by the rest of the notebook.

    Returns:
        A ``(documents, labels)`` pair of equal-length lists. Each document is
        a two-sentence string and the label at the same index names its topic.
    """
    # One (label, text) pair per topic keeps each text next to its category.
    categorized_texts = [
        ('Technology',
         "Machine learning algorithms can automatically improve through experience and data analysis. "
         "Deep learning networks use artificial neural networks with multiple layers to model complex patterns."),
        ('Science',
         "The theory of relativity revolutionized our understanding of space, time, and gravity. "
         "Quantum mechanics describes the behavior of matter and energy at the molecular, atomic, nuclear levels."),
        ('Medicine',
         "Medical researchers are developing new treatments for cancer using immunotherapy approaches. "
         "Precision medicine uses genetic information to tailor treatments to individual patients."),
        ('Business',
         "Digital transformation is changing how companies operate and deliver value to customers. "
         "Data-driven decision making helps organizations optimize their business processes and strategies."),
        ('Education',
         "Online learning platforms are making education more accessible to students worldwide. "
         "Artificial intelligence is being used to personalize learning experiences for different students."),
        ('Environment',
         "Climate change is affecting global weather patterns and ecosystem sustainability. "
         "Renewable energy sources like solar and wind power are becoming more cost-effective solutions."),
        ('Sports',
         "Athletic performance analysis uses advanced statistics and biomechanical measurements. "
         "Sports teams employ data scientists to analyze player performance and game strategies."),
        ('Arts',
         "Digital art and computer graphics are creating new forms of creative expression. "
         "Artists are using machine learning to generate novel artistic styles and compositions."),
    ]

    documents = [text for _, text in categorized_texts]
    labels = [category for category, _ in categorized_texts]

    return documents, labels

# Create sample corpus
corpus, document_labels = create_sample_corpus()

print("Sample Corpus Created:")
print(f"Number of documents: {len(corpus)}")
print(f"Categories: {document_labels}")
# A single-backslash "\n" emits a blank line; the doubled form printed the
# two characters backslash + n verbatim.
print("\nFirst document preview:")
print(f'"{corpus[0][:100]}..."')


In [None]:
# Comprehensive NLP preprocessing pipeline
class NLPPipeline:
    """Configurable text preprocessing pipeline.

    Supports cleaning, tokenization, stop-word removal, stemming and
    lemmatization. The NLTK implementations are used when ``use_nltk`` is true
    and the module-level ``NLTK_AVAILABLE`` flag is set; otherwise the
    simplified fallbacks defined on this class are used.
    """

    # Steps applied when the caller does not pass ``steps``. An immutable
    # tuple, so the default can never be mutated between calls.
    DEFAULT_STEPS = ('clean', 'tokenize', 'stopwords', 'lemmatize')

    # Patterns used by clean_text, compiled once at class creation.
    _DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s.,!?;:-]')
    _PUNCTUATION_RUN_RE = re.compile(r'[.,!?;:-]+')
    _WHITESPACE_RUN_RE = re.compile(r'\s+')

    # Suffixes tried, in this order, by simple_stem.
    _SIMPLE_SUFFIXES = ('ing', 'ed', 'er', 'est', 'ly', 'tion', 'ness', 'ment')

    # Stop-word list used when NLTK is not in use.
    _FALLBACK_STOP_WORDS = frozenset({
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
        'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
        'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after', 'above',
        'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once',
    })

    def __init__(self, use_nltk=True):
        """Set up the pipeline.

        Args:
            use_nltk: Request the NLTK-backed implementations. They are only
                used when the module-level ``NLTK_AVAILABLE`` flag is also true.
        """
        self.use_nltk = use_nltk and NLTK_AVAILABLE
        self.processing_stats = {}

        if self.use_nltk:
            self.stemmer = PorterStemmer()
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
        else:
            # Simple implementations when NLTK is not available
            self.stop_words = set(self._FALLBACK_STOP_WORDS)

    def clean_text(self, text):
        """Lowercase ``text`` and normalize its characters, punctuation and spacing.

        Characters other than ASCII letters, digits, whitespace and ``.,!?;:-``
        are dropped, each run of punctuation is reduced to its first character,
        and runs of whitespace become a single space.

        Returns:
            The cleaned string with leading/trailing whitespace removed.
        """
        text = text.lower()

        # Remove special characters (keep basic punctuation and whitespace).
        text = self._DISALLOWED_CHARS_RE.sub('', text)

        # Collapse repeated punctuation ("!!" -> "!", "..." -> ".").
        text = self._PUNCTUATION_RUN_RE.sub(lambda m: m.group(0)[0], text)

        # Collapse whitespace last so gaps left by removed characters are
        # normalized as well.
        text = self._WHITESPACE_RUN_RE.sub(' ', text)

        return text.strip()

    def simple_tokenize(self, text):
        """Tokenize without NLTK: strip ASCII punctuation, split on whitespace."""
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text.split()

    def simple_stem(self, word):
        """Strip the first matching common suffix from ``word``.

        A suffix is only removed when more than two characters would remain;
        otherwise (or when no suffix matches) the word is returned unchanged.
        """
        for suffix in self._SIMPLE_SUFFIXES:
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                return word[:-len(suffix)]
        return word

    def tokenize(self, text):
        """Split ``text`` into word tokens (NLTK ``word_tokenize`` or the fallback)."""
        if self.use_nltk:
            return word_tokenize(text)
        return self.simple_tokenize(text)

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without those whose lowercase form is a stop word."""
        return [token for token in tokens if token.lower() not in self.stop_words]

    def stem_tokens(self, tokens):
        """Stem every token (Porter stemmer with NLTK, ``simple_stem`` otherwise)."""
        if self.use_nltk:
            return [self.stemmer.stem(token) for token in tokens]
        return [self.simple_stem(token) for token in tokens]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token; without NLTK, stemming is used as an approximation."""
        if self.use_nltk:
            return [self.lemmatizer.lemmatize(token) for token in tokens]
        return self.stem_tokens(tokens)

    def process_document(self, text, steps=None):
        """Run one document through the selected pipeline steps.

        Args:
            text: Raw document string.
            steps: Collection of step names to apply, drawn from ``'clean'``,
                ``'tokenize'``, ``'stopwords'``, ``'stem'`` and ``'lemmatize'``.
                Steps always run in that fixed order regardless of the order
                given. Defaults to ``DEFAULT_STEPS``.

        Returns:
            A dict holding the original text, the output of each applied step
            (``'cleaned'``, ``'tokens'``, ``'no_stopwords'``, ``'stemmed'``,
            ``'lemmatized'``), the final tokens (``'final_tokens'``), their
            space-joined form (``'final'``) and a ``'stats'`` dict with word
            counts and the reduction ratio.
        """
        if steps is None:
            steps = self.DEFAULT_STEPS

        original_length = len(text.split())

        # Track the output of every applied step.
        result = {'original': text}

        if 'clean' in steps:
            text = self.clean_text(text)
            result['cleaned'] = text

        if 'tokenize' in steps:
            tokens = self.tokenize(text)
            result['tokens'] = tokens
        else:
            tokens = text.split()

        if 'stopwords' in steps:
            tokens = self.remove_stopwords(tokens)
            result['no_stopwords'] = tokens

        if 'stem' in steps:
            tokens = self.stem_tokens(tokens)
            result['stemmed'] = tokens

        if 'lemmatize' in steps:
            tokens = self.lemmatize_tokens(tokens)
            result['lemmatized'] = tokens

        result['final'] = ' '.join(tokens)
        result['final_tokens'] = tokens

        result['stats'] = {
            'original_words': original_length,
            'final_words': len(tokens),
            'reduction_ratio': 1 - (len(tokens) / original_length) if original_length > 0 else 0
        }

        return result

    def process_corpus(self, documents, steps=None):
        """Process every document and record corpus-level statistics.

        Progress and a summary are printed; the aggregated statistics are also
        stored on ``self.processing_stats``.

        Args:
            documents: Sequence of raw document strings.
            steps: Step names forwarded to ``process_document``; defaults to
                ``DEFAULT_STEPS``.

        Returns:
            List of per-document result dicts, in input order.
        """
        processed_docs = []
        total_stats = {'original_words': 0, 'final_words': 0}

        print(f"Processing {len(documents)} documents...")

        for i, doc in enumerate(documents):
            processed = self.process_document(doc, steps)
            processed_docs.append(processed)

            total_stats['original_words'] += processed['stats']['original_words']
            total_stats['final_words'] += processed['stats']['final_words']

            if (i + 1) % 2 == 0:
                print(f"  Processed {i + 1}/{len(documents)} documents")

        # An empty corpus (or one made only of empty documents) has no words
        # to reduce; report 0 instead of dividing by zero.
        original_total = total_stats['original_words']
        total_stats['reduction_ratio'] = (
            1 - (total_stats['final_words'] / original_total) if original_total > 0 else 0
        )

        self.processing_stats = total_stats
        print("\nProcessing complete!")
        print(f"Total word reduction: {total_stats['reduction_ratio']:.2%}")

        return processed_docs

# Initialize and demonstrate NLP pipeline
# (string escapes below use a single backslash so "\n" prints a blank line
# rather than the literal characters backslash + n)
print("=== NLP Pipeline Demonstration ===")

nlp_pipeline = NLPPipeline(use_nltk=NLTK_AVAILABLE)

# Process a single document to show step-by-step
print("\n1. Single Document Processing:")
sample_doc = corpus[0]
print(f"Original: {sample_doc}")

steps_demo = ['clean', 'tokenize', 'stopwords', 'lemmatize']
processed_sample = nlp_pipeline.process_document(sample_doc, steps_demo)

print(f"\nCleaned: {processed_sample['cleaned']}")
print(f"Tokens (first 10): {processed_sample['tokens'][:10]}")
print(f"No stopwords (first 10): {processed_sample['no_stopwords'][:10]}")
print(f"Lemmatized (first 10): {processed_sample['lemmatized'][:10]}")
print(f"Final: {processed_sample['final']}")
print(f"Word reduction: {processed_sample['stats']['reduction_ratio']:.2%}")

# Process entire corpus
print("\n2. Full Corpus Processing:")
processed_corpus = nlp_pipeline.process_corpus(corpus)


In [None]:
# Comprehensive NLP preprocessing pipeline
class NLPPipeline:
    """Configurable text preprocessing pipeline.

    Supports cleaning, tokenization, stop-word removal, stemming and
    lemmatization. The NLTK implementations are used when ``use_nltk`` is true
    and the module-level ``NLTK_AVAILABLE`` flag is set; otherwise the
    simplified fallbacks defined on this class are used.
    """

    # Steps applied when the caller does not pass ``steps``. An immutable
    # tuple, so the default can never be mutated between calls.
    DEFAULT_STEPS = ('clean', 'tokenize', 'stopwords', 'lemmatize')

    # Patterns used by clean_text, compiled once at class creation.
    _DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s.,!?;:-]')
    _PUNCTUATION_RUN_RE = re.compile(r'[.,!?;:-]+')
    _WHITESPACE_RUN_RE = re.compile(r'\s+')

    # Suffixes tried, in this order, by simple_stem.
    _SIMPLE_SUFFIXES = ('ing', 'ed', 'er', 'est', 'ly', 'tion', 'ness', 'ment')

    # Stop-word list used when NLTK is not in use.
    _FALLBACK_STOP_WORDS = frozenset({
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
        'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
        'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after', 'above',
        'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once',
    })

    def __init__(self, use_nltk=True):
        """Set up the pipeline.

        Args:
            use_nltk: Request the NLTK-backed implementations. They are only
                used when the module-level ``NLTK_AVAILABLE`` flag is also true.
        """
        self.use_nltk = use_nltk and NLTK_AVAILABLE
        self.processing_stats = {}

        if self.use_nltk:
            self.stemmer = PorterStemmer()
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
        else:
            # Simple implementations when NLTK is not available
            self.stop_words = set(self._FALLBACK_STOP_WORDS)

    def clean_text(self, text):
        """Lowercase ``text`` and normalize its characters, punctuation and spacing.

        Characters other than ASCII letters, digits, whitespace and ``.,!?;:-``
        are dropped, each run of punctuation is reduced to its first character,
        and runs of whitespace become a single space.

        Returns:
            The cleaned string with leading/trailing whitespace removed.
        """
        text = text.lower()

        # Remove special characters (keep basic punctuation and whitespace).
        text = self._DISALLOWED_CHARS_RE.sub('', text)

        # Collapse repeated punctuation ("!!" -> "!", "..." -> ".").
        text = self._PUNCTUATION_RUN_RE.sub(lambda m: m.group(0)[0], text)

        # Collapse whitespace last so gaps left by removed characters are
        # normalized as well.
        text = self._WHITESPACE_RUN_RE.sub(' ', text)

        return text.strip()

    def simple_tokenize(self, text):
        """Tokenize without NLTK: strip ASCII punctuation, split on whitespace."""
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text.split()

    def simple_stem(self, word):
        """Strip the first matching common suffix from ``word``.

        A suffix is only removed when more than two characters would remain;
        otherwise (or when no suffix matches) the word is returned unchanged.
        """
        for suffix in self._SIMPLE_SUFFIXES:
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                return word[:-len(suffix)]
        return word

    def tokenize(self, text):
        """Split ``text`` into word tokens (NLTK ``word_tokenize`` or the fallback)."""
        if self.use_nltk:
            return word_tokenize(text)
        return self.simple_tokenize(text)

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without those whose lowercase form is a stop word."""
        return [token for token in tokens if token.lower() not in self.stop_words]

    def stem_tokens(self, tokens):
        """Stem every token (Porter stemmer with NLTK, ``simple_stem`` otherwise)."""
        if self.use_nltk:
            return [self.stemmer.stem(token) for token in tokens]
        return [self.simple_stem(token) for token in tokens]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token; without NLTK, stemming is used as an approximation."""
        if self.use_nltk:
            return [self.lemmatizer.lemmatize(token) for token in tokens]
        return self.stem_tokens(tokens)

    def process_document(self, text, steps=None):
        """Run one document through the selected pipeline steps.

        Args:
            text: Raw document string.
            steps: Collection of step names to apply, drawn from ``'clean'``,
                ``'tokenize'``, ``'stopwords'``, ``'stem'`` and ``'lemmatize'``.
                Steps always run in that fixed order regardless of the order
                given. Defaults to ``DEFAULT_STEPS``.

        Returns:
            A dict holding the original text, the output of each applied step
            (``'cleaned'``, ``'tokens'``, ``'no_stopwords'``, ``'stemmed'``,
            ``'lemmatized'``), the final tokens (``'final_tokens'``), their
            space-joined form (``'final'``) and a ``'stats'`` dict with word
            counts and the reduction ratio.
        """
        if steps is None:
            steps = self.DEFAULT_STEPS

        original_length = len(text.split())

        # Track the output of every applied step.
        result = {'original': text}

        if 'clean' in steps:
            text = self.clean_text(text)
            result['cleaned'] = text

        if 'tokenize' in steps:
            tokens = self.tokenize(text)
            result['tokens'] = tokens
        else:
            tokens = text.split()

        if 'stopwords' in steps:
            tokens = self.remove_stopwords(tokens)
            result['no_stopwords'] = tokens

        if 'stem' in steps:
            tokens = self.stem_tokens(tokens)
            result['stemmed'] = tokens

        if 'lemmatize' in steps:
            tokens = self.lemmatize_tokens(tokens)
            result['lemmatized'] = tokens

        result['final'] = ' '.join(tokens)
        result['final_tokens'] = tokens

        result['stats'] = {
            'original_words': original_length,
            'final_words': len(tokens),
            'reduction_ratio': 1 - (len(tokens) / original_length) if original_length > 0 else 0
        }

        return result

    def process_corpus(self, documents, steps=None):
        """Process every document and record corpus-level statistics.

        Progress and a summary are printed; the aggregated statistics are also
        stored on ``self.processing_stats``.

        Args:
            documents: Sequence of raw document strings.
            steps: Step names forwarded to ``process_document``; defaults to
                ``DEFAULT_STEPS``.

        Returns:
            List of per-document result dicts, in input order.
        """
        processed_docs = []
        total_stats = {'original_words': 0, 'final_words': 0}

        print(f"Processing {len(documents)} documents...")

        for i, doc in enumerate(documents):
            processed = self.process_document(doc, steps)
            processed_docs.append(processed)

            total_stats['original_words'] += processed['stats']['original_words']
            total_stats['final_words'] += processed['stats']['final_words']

            if (i + 1) % 2 == 0:
                print(f"  Processed {i + 1}/{len(documents)} documents")

        # An empty corpus (or one made only of empty documents) has no words
        # to reduce; report 0 instead of dividing by zero.
        original_total = total_stats['original_words']
        total_stats['reduction_ratio'] = (
            1 - (total_stats['final_words'] / original_total) if original_total > 0 else 0
        )

        self.processing_stats = total_stats
        print("\nProcessing complete!")
        print(f"Total word reduction: {total_stats['reduction_ratio']:.2%}")

        return processed_docs

# Initialize and demonstrate NLP pipeline
# (string escapes below use a single backslash so "\n" prints a blank line
# rather than the literal characters backslash + n)
print("=== NLP Pipeline Demonstration ===")

nlp_pipeline = NLPPipeline(use_nltk=NLTK_AVAILABLE)

# Process a single document to show step-by-step
print("\n1. Single Document Processing:")
sample_doc = corpus[0]
print(f"Original: {sample_doc}")

steps_demo = ['clean', 'tokenize', 'stopwords', 'lemmatize']
processed_sample = nlp_pipeline.process_document(sample_doc, steps_demo)

print(f"\nCleaned: {processed_sample['cleaned']}")
print(f"Tokens (first 10): {processed_sample['tokens'][:10]}")
print(f"No stopwords (first 10): {processed_sample['no_stopwords'][:10]}")
print(f"Lemmatized (first 10): {processed_sample['lemmatized'][:10]}")
print(f"Final: {processed_sample['final']}")
print(f"Word reduction: {processed_sample['stats']['reduction_ratio']:.2%}")

# Process entire corpus
print("\n2. Full Corpus Processing:")
processed_corpus = nlp_pipeline.process_corpus(corpus)


In [None]:
# Text vectorization and analysis
class TextVectorizer:
    """Vectorize text with scikit-learn and compute simple corpus statistics.

    Fitted vectorizers and their feature names are cached on the instance
    under the keys ``'bow'`` and ``'tfidf'``.
    """

    def __init__(self):
        self.vectorizers = {}    # method key -> fitted scikit-learn vectorizer
        self.feature_names = {}  # method key -> array of feature names
        self.vocabulary = {}     # not populated by any method of this class

    def bag_of_words(self, documents, max_features=1000):
        """Fit a unigram+bigram ``CountVectorizer`` and return the dense count matrix.

        Args:
            documents: Iterable of document strings.
            max_features: Upper bound on the vocabulary size.

        Returns:
            Dense array with one row per document.
        """
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, 2))
        bow_matrix = vectorizer.fit_transform(documents)

        self.vectorizers['bow'] = vectorizer
        self.feature_names['bow'] = vectorizer.get_feature_names_out()

        return bow_matrix.toarray()

    def tfidf_vectorization(self, documents, max_features=1000):
        """Fit a unigram+bigram ``TfidfVectorizer`` and return the dense TF-IDF matrix.

        Args:
            documents: Iterable of document strings.
            max_features: Upper bound on the vocabulary size.

        Returns:
            Dense array with one row per document.
        """
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(documents)

        self.vectorizers['tfidf'] = vectorizer
        self.feature_names['tfidf'] = vectorizer.get_feature_names_out()

        return tfidf_matrix.toarray()

    def analyze_vocabulary(self, documents):
        """Compute word-frequency statistics over whitespace-split documents.

        Args:
            documents: Iterable of document strings.

        Returns:
            ``(stats, word_freq)`` where ``word_freq`` is a ``Counter`` of
            words and ``stats`` holds ``total_words``, ``unique_words``,
            ``vocabulary_richness`` (unique / total), ``most_common`` (top 10
            ``(word, count)`` pairs) and ``singleton_ratio`` (share of unique
            words that occur exactly once). Both ratios are ``0.0`` when the
            input contains no words.
        """
        word_freq = Counter(word for doc in documents for word in doc.split())

        total_words = sum(word_freq.values())
        unique_words = len(word_freq)
        singletons = sum(1 for count in word_freq.values() if count == 1)

        stats = {
            'total_words': total_words,
            'unique_words': unique_words,
            # Guard the ratios so an empty corpus yields 0.0 instead of a
            # ZeroDivisionError.
            'vocabulary_richness': unique_words / total_words if total_words else 0.0,
            'most_common': word_freq.most_common(10),
            'singleton_ratio': singletons / unique_words if unique_words else 0.0
        }

        return stats, word_freq

    def document_similarity(self, doc_vectors, method='cosine'):
        """Return the pairwise document similarity matrix.

        Args:
            doc_vectors: Matrix with one row per document.
            method: ``'cosine'`` for cosine similarity; any other value
                selects Euclidean distance mapped to ``1 / (1 + distance)``.
        """
        if method == 'cosine':
            similarity_matrix = cosine_similarity(doc_vectors)
        else:
            # Euclidean distance converted to similarity
            from sklearn.metrics.pairwise import euclidean_distances
            distances = euclidean_distances(doc_vectors)
            similarity_matrix = 1 / (1 + distances)

        return similarity_matrix

# Apply text vectorization
# (string escapes below use a single backslash so "\n" prints a blank line
# rather than the literal characters backslash + n)
print("\n=== Text Vectorization Analysis ===")

vectorizer = TextVectorizer()

# Get processed documents
processed_texts = [doc['final'] for doc in processed_corpus]

# Vocabulary analysis
print("\n1. Vocabulary Analysis:")
vocab_stats, word_frequencies = vectorizer.analyze_vocabulary(processed_texts)

print(f"Total words: {vocab_stats['total_words']}")
print(f"Unique words: {vocab_stats['unique_words']}")
print(f"Vocabulary richness: {vocab_stats['vocabulary_richness']:.3f}")
print(f"Singleton ratio: {vocab_stats['singleton_ratio']:.3f}")
print("\nMost common words:")
for word, count in vocab_stats['most_common']:
    print(f"  {word}: {count}")

# Bag of Words
print("\n2. Bag of Words Vectorization:")
bow_vectors = vectorizer.bag_of_words(processed_texts, max_features=50)
print(f"BoW matrix shape: {bow_vectors.shape}")

# TF-IDF
print("\n3. TF-IDF Vectorization:")
tfidf_vectors = vectorizer.tfidf_vectorization(processed_texts, max_features=50)
print(f"TF-IDF matrix shape: {tfidf_vectors.shape}")

# Document similarity analysis
print("\n4. Document Similarity Analysis:")
bow_similarity = vectorizer.document_similarity(bow_vectors)
tfidf_similarity = vectorizer.document_similarity(tfidf_vectors)

print(f"BoW similarity matrix shape: {bow_similarity.shape}")
print(f"TF-IDF similarity matrix shape: {tfidf_similarity.shape}")

# Visualization: 2x3 grid summarizing vocabulary and similarity statistics
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Word frequency distribution (log-scaled counts)
word_counts = list(word_frequencies.values())
axes[0, 0].hist(word_counts, bins=20, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Word Frequency Distribution')
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_ylabel('Number of Words')
axes[0, 0].set_yscale('log')

# Most common words
common_words = [word for word, _ in vocab_stats['most_common']]
common_counts = [count for _, count in vocab_stats['most_common']]
axes[0, 1].bar(range(len(common_words)), common_counts)
axes[0, 1].set_title('Most Common Words')
axes[0, 1].set_xlabel('Words')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_xticks(range(len(common_words)))
axes[0, 1].set_xticklabels(common_words, rotation=45)

# Document length distribution
doc_lengths = [len(doc.split()) for doc in processed_texts]
axes[0, 2].hist(doc_lengths, bins=10, alpha=0.7, edgecolor='black')
axes[0, 2].set_title('Document Length Distribution')
axes[0, 2].set_xlabel('Number of Words')
axes[0, 2].set_ylabel('Number of Documents')

# BoW similarity heatmap
im1 = axes[1, 0].imshow(bow_similarity, cmap='viridis', aspect='auto')
axes[1, 0].set_title('BoW Similarity Matrix')
axes[1, 0].set_xlabel('Documents')
axes[1, 0].set_ylabel('Documents')
plt.colorbar(im1, ax=axes[1, 0])

# TF-IDF similarity heatmap
im2 = axes[1, 1].imshow(tfidf_similarity, cmap='viridis', aspect='auto')
axes[1, 1].set_title('TF-IDF Similarity Matrix')
axes[1, 1].set_xlabel('Documents')
axes[1, 1].set_ylabel('Documents')
plt.colorbar(im2, ax=axes[1, 1])

# Similarity comparison: average over the strict upper triangle (k=1) so the
# diagonal self-similarities and mirrored pairs are excluded
bow_avg_sim = np.mean(bow_similarity[np.triu_indices_from(bow_similarity, k=1)])
tfidf_avg_sim = np.mean(tfidf_similarity[np.triu_indices_from(tfidf_similarity, k=1)])

methods = ['BoW', 'TF-IDF']
avg_similarities = [bow_avg_sim, tfidf_avg_sim]

bars = axes[1, 2].bar(methods, avg_similarities, alpha=0.7)
axes[1, 2].set_title('Average Document Similarity')
axes[1, 2].set_ylabel('Average Similarity')

# Add value labels
for bar, sim in zip(bars, avg_similarities):
    axes[1, 2].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                   f'{sim:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Single-backslash "\n" so a blank line is printed before the summary
print(f"\nAverage BoW similarity: {bow_avg_sim:.3f}")
print(f"Average TF-IDF similarity: {tfidf_avg_sim:.3f}")


In [None]:
# Text clustering and dimensionality reduction
def text_clustering_analysis(vectors, labels, method_name):
    """Cluster document vectors with K-means, score the result and plot it.

    One cluster is requested per distinct label. The clustering is compared
    with the labels via the Adjusted Rand Index, a silhouette score is
    computed when it is defined, and a 2-D projection of the documents is
    plotted twice: colored by label and colored by assigned cluster.

    Args:
        vectors: 2-D array with one row per document (``.shape`` and boolean
            mask indexing are used on it).
        labels: Sequence with one category label per document.
        method_name: Name of the vectorization method, used in printed
            headings and plot titles.

    Returns:
        Dict with ``'cluster_assignments'``, ``'vectors_2d'``, ``'ari_score'``
        and ``'silhouette_score'``. The silhouette score is ``nan`` when the
        number of clusters found is outside ``2 .. n_samples - 1``.
    """
    from sklearn.metrics import adjusted_rand_score, silhouette_score

    print(f"\n=== {method_name} Clustering Analysis ===")

    n_samples = vectors.shape[0]

    # First-appearance order (instead of set order) keeps legend entries and
    # colors stable from run to run.
    unique_labels = list(dict.fromkeys(labels))

    # K-means clustering with one cluster per category
    n_clusters = len(unique_labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_assignments = kmeans.fit_predict(vectors)

    # Dimensionality reduction for visualization
    if vectors.shape[1] > 2:
        if vectors.shape[1] > 50:
            # PCA cannot extract more components than there are samples.
            pca = PCA(n_components=min(50, n_samples))
            vectors_reduced = pca.fit_transform(vectors)
        else:
            vectors_reduced = vectors

        # t-SNE for 2D visualization; perplexity must stay below n_samples
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(5, n_samples - 1))
        vectors_2d = tsne.fit_transform(vectors_reduced)
    else:
        vectors_2d = vectors

    # Convert text labels to numeric ids for comparison with the clusters
    label_to_num = {label: i for i, label in enumerate(unique_labels)}
    true_labels = [label_to_num[label] for label in labels]

    ari_score = adjusted_rand_score(true_labels, cluster_assignments)

    # silhouette_score raises ValueError unless 2 <= n_clusters <= n_samples - 1
    # (e.g. when every document ends up in its own cluster), so report nan
    # in that case instead of aborting the analysis.
    n_found_clusters = len(set(cluster_assignments))
    if 2 <= n_found_clusters <= n_samples - 1:
        silhouette_avg = silhouette_score(vectors, cluster_assignments)
    else:
        silhouette_avg = float('nan')

    print(f"Adjusted Rand Index: {ari_score:.3f}")
    print(f"Silhouette Score: {silhouette_avg:.3f}")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot by true categories
    labels_array = np.asarray(labels)
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labels)))

    for i, label in enumerate(unique_labels):
        mask = labels_array == label
        axes[0].scatter(vectors_2d[mask, 0], vectors_2d[mask, 1],
                       c=[colors[i]], label=label, alpha=0.7, s=50)

    axes[0].set_title(f'{method_name} - True Categories')
    axes[0].set_xlabel('t-SNE Component 1')
    axes[0].set_ylabel('t-SNE Component 2')
    axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Plot by clusters
    cluster_colors = plt.cm.Set1(np.linspace(0, 1, n_clusters))

    for i in range(n_clusters):
        mask = cluster_assignments == i
        axes[1].scatter(vectors_2d[mask, 0], vectors_2d[mask, 1],
                       c=[cluster_colors[i]], label=f'Cluster {i}', alpha=0.7, s=50)

    axes[1].set_title(f'{method_name} - K-means Clusters')
    axes[1].set_xlabel('t-SNE Component 1')
    axes[1].set_ylabel('t-SNE Component 2')
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    # Analyze cluster composition: label counts inside each cluster
    print("\nCluster Composition:")
    for i in range(n_clusters):
        label_counts = Counter(
            label for label, cluster_id in zip(labels, cluster_assignments)
            if cluster_id == i
        )
        print(f"  Cluster {i}: {dict(label_counts)}")

    return {
        'cluster_assignments': cluster_assignments,
        'vectors_2d': vectors_2d,
        'ari_score': ari_score,
        'silhouette_score': silhouette_avg
    }

# Perform clustering analysis
print("=== Text Clustering and Visualization ===")

# Clustering with BoW
bow_results = text_clustering_analysis(bow_vectors, document_labels, "Bag of Words")

# Clustering with TF-IDF
tfidf_results = text_clustering_analysis(tfidf_vectors, document_labels, "TF-IDF")

# Comparison summary (single-backslash "\n" so a blank line precedes the heading)
print("\n=== Clustering Comparison Summary ===")
print(f"BoW - ARI: {bow_results['ari_score']:.3f}, Silhouette: {bow_results['silhouette_score']:.3f}")
print(f"TF-IDF - ARI: {tfidf_results['ari_score']:.3f}, Silhouette: {tfidf_results['silhouette_score']:.3f}")

# Feature importance analysis
# (string escapes below use a single backslash so "\n" prints a blank line
# rather than the literal characters backslash + n)
print("\n=== Feature Importance Analysis ===")

# Get top TF-IDF features: rank by mean score across documents, highest first
tfidf_feature_names = vectorizer.feature_names['tfidf']
tfidf_means = np.mean(tfidf_vectors, axis=0)
top_indices = np.argsort(tfidf_means)[-10:][::-1]

print("Top 10 TF-IDF features (by average importance):")
for i, idx in enumerate(top_indices):
    print(f"  {i+1}. {tfidf_feature_names[idx]}: {tfidf_means[idx]:.3f}")

# Document-specific analysis: three highest-scoring terms per document
print("\nDocument-specific high TF-IDF terms:")
for doc_label, tfidf_vec in zip(document_labels, tfidf_vectors):
    top_features_idx = np.argsort(tfidf_vec)[-3:][::-1]
    top_features = [(tfidf_feature_names[i], tfidf_vec[i]) for i in top_features_idx]
    print(f"  {doc_label}: {', '.join([f'{term}({score:.2f})' for term, score in top_features])}")

# Visualization of top features
plt.figure(figsize=(12, 6))
top_features = [tfidf_feature_names[idx] for idx in top_indices]
top_scores = [tfidf_means[idx] for idx in top_indices]

bars = plt.bar(range(len(top_features)), top_scores, alpha=0.7)
plt.title('Top 10 TF-IDF Features by Average Importance')
plt.xlabel('Features')
plt.ylabel('Average TF-IDF Score')
plt.xticks(range(len(top_features)), top_features, rotation=45, ha='right')

# Add value labels
for bar, score in zip(bars, top_scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
             f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\nNLP fundamentals pipeline analysis completed!")
