# Word2Vec using Gensim

In [None]:
import sys
import os

# Quick check for packages without subprocess (much faster)
def check_packages():
    """Verify that the packages this notebook relies on can be imported.

    The required packages (nltk, gensim, scipy, numpy) are probed first,
    followed by matplotlib, which is optional. If any required package
    fails to import, an installation hint is printed and the process exits
    with status 1. If only matplotlib is missing, a notice about text-only
    mode is printed instead.

    Returns:
        True when matplotlib can be imported, False otherwise.
    """
    banner = "=" * 60

    def _importable(name):
        # Only the success of the import matters; the module is not kept.
        try:
            __import__(name)
        except ImportError:
            return False
        return True

    # Core packages (required), probed in a fixed order
    required = ('nltk', 'gensim', 'scipy', 'numpy')
    missing = [name for name in required if not _importable(name)]

    # Visualization package (optional)
    matplotlib_available = _importable('matplotlib')

    if missing:
        print(banner)
        print("‚ö†Ô∏è  MISSING REQUIRED PACKAGES")
        print(banner)
        print(f"\nMissing: {', '.join(missing)}")
        print(f"\nüí° Install with: pip install {' '.join(missing)}")
        sys.exit(1)

    if not matplotlib_available:
        print(banner)
        print("‚ÑπÔ∏è  OPTIONAL PACKAGES MISSING")
        print(banner)
        print("\nMissing: matplotlib (for visualization)")
        print("üí° Install with: pip install matplotlib")
        print("\n‚úÖ Running in TEXT-ONLY mode (no visualization)")
        print(banner)

    return matplotlib_available

HAS_MATPLOTLIB = check_packages()

# Fast imports
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr, pearsonr
import numpy as np
import heapq

if HAS_MATPLOTLIB:
    import matplotlib.pyplot as plt

# Download NLTK data only if needed.
# Each resource is probed by actually using it; a LookupError from the probe
# is treated as "data not available" and triggers a quiet download.
try:
    # Probe the tokenizer with a trivial input.
    word_tokenize("test")
except LookupError:
    # Both tokenizer resources are fetched: 'punkt_tab' and 'punkt'.
    print("üì• Downloading NLTK punkt_tab...")
    nltk.download('punkt_tab', quiet=True)
    print("üì• Downloading NLTK punkt...")
    nltk.download('punkt', quiet=True)

try:
    # Probe the English stop-word list.
    stopwords.words('english')
except LookupError:
    print("üì• Downloading stopwords...")
    nltk.download('stopwords', quiet=True)


def manual_pca_2d(vectors):
    """
    Project vectors onto their first two principal components using SVD.

    The data is mean-centered per feature and decomposed with an economy
    SVD; the coordinates are the left singular vectors scaled by the
    corresponding singular values.

    Args:
        vectors: array-like of shape (n_samples, n_features)

    Returns:
        numpy array of shape (n_samples, 2) with PCA-reduced coordinates.
        When fewer than two components exist (a single sample or a single
        feature), the missing columns are filled with zeros so the result
        always has two columns.
    """
    data = np.asarray(vectors)
    centered = data - np.mean(data, axis=0)

    # Economy SVD: at most min(n_samples, n_features) components come back.
    U, S, _ = np.linalg.svd(centered, full_matrices=False)
    pca_result = U[:, :2] * S[:2]

    # Callers index column 1 unconditionally, so pad degenerate inputs
    # (one sample or one feature) up to two columns.
    missing_columns = 2 - pca_result.shape[1]
    if missing_columns > 0:
        padding = np.zeros((pca_result.shape[0], missing_columns),
                           dtype=pca_result.dtype)
        pca_result = np.hstack([pca_result, padding])

    return pca_result


def calculate_metrics(true_positives, false_positives, false_negatives, true_negatives=None):
    """
    Derive precision, recall, F1-score and accuracy from confusion counts.

    Any ratio whose denominator is not positive is reported as 0.0.

    Args:
        true_positives: Number of true positive predictions
        false_positives: Number of false positive predictions
        false_negatives: Number of false negative predictions
        true_negatives: Number of true negative predictions (optional)

    Returns:
        Dictionary with the keys 'precision', 'recall', 'f1_score' and
        'accuracy'. 'accuracy' is None when true_negatives is not given.
    """
    def _ratio(numerator, denominator):
        # Guarded division shared by every metric below.
        return numerator / denominator if denominator > 0 else 0.0

    # Precision: TP / (TP + FP)
    precision = _ratio(true_positives, true_positives + false_positives)

    # Recall: TP / (TP + FN)
    recall = _ratio(true_positives, true_positives + false_negatives)

    # F1-Score: harmonic mean of precision and recall
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    # Accuracy: (TP + TN) / (TP + TN + FP + FN), only with TN available
    if true_negatives is None:
        accuracy = None
    else:
        population = true_positives + true_negatives + false_positives + false_negatives
        accuracy = _ratio(true_positives + true_negatives, population)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy,
    }


class Word2VecEvaluator:
    """
    Optimized Word2Vec model evaluation toolkit.
    
    Features:
    - Vocabulary exploration and statistics
    - Analogy evaluation
    - Similarity testing with correlation metrics
    - Categorization assessment
    - 2D visualizations using fast PCA
    - Interactive testing modes
    - Vector caching for improved performance
    
    Example:
        >>> evaluator = Word2VecEvaluator(model_path='model.bin')
        >>> evaluator.show_vocabulary(limit=20)
        >>> evaluator.visualize_word_neighbors('king', topn=15)
        >>> analogies = [("king", "man", "woman", "queen")]
        >>> metrics, results = evaluator.evaluate_analogies(analogies)
    """
    
    def __init__(self, model_path=None, model=None, cache_size=1000):
        """
        Initialize the evaluator.
        
        Args:
            model_path: Path to saved Word2Vec model
            model: Pre-loaded Word2Vec model object
            cache_size: Maximum number of word vectors to cache (default: 1000)
        """
        if model is not None:
            self.model = model
        elif model_path and os.path.exists(model_path):
            self.model = Word2Vec.load(model_path)
        else:
            raise ValueError("Must provide either model_path or model object")
        
        self.stop_words = set(stopwords.words('english'))
        self._word_cache = {}
        self._cache_size = cache_size
    
    def _get_vector(self, word):
        """Get word vector with caching for performance."""
        if word not in self._word_cache:
            if word not in self.model.wv:
                return None
            
            # Clear cache if too large
            if len(self._word_cache) >= self._cache_size:
                remove_count = self._cache_size // 5
                for _ in range(remove_count):
                    self._word_cache.popitem()
            
            self._word_cache[word] = self.model.wv[word]
        
        return self._word_cache[word]
    
    def show_vocabulary(self, limit=50, start_from=0):
        """Display vocabulary words with indices."""
        print(f"\nüìö Vocabulary (showing {limit} words starting from {start_from}):")
        print("="*60)
        
        vocab = self.model.wv.index_to_key
        end = min(start_from + limit, len(vocab))
        
        for i in range(start_from, end):
            word = vocab[i]
            print(f"  {i:4d}. {word}")
        
        print(f"\nüìä Total vocabulary: {len(vocab):,} words")
        
        if end < len(vocab):
            print(f"üí° Use show_vocabulary(limit={limit}, start_from={end}) to see more")
    
    def search_vocabulary(self, pattern, max_results=50):
        """Search vocabulary by pattern."""
        print(f"\nüîç Searching for words matching '{pattern}':")
        print("="*60)
        
        vocab = self.model.wv.index_to_key
        matches = [w for w in vocab if pattern.lower() in w.lower()]
        
        if not matches:
            print(f"‚ùå No matches found for '{pattern}'")
            return
        
        print(f"‚úÖ Found {len(matches)} matches (showing first {max_results}):\n")
        for i, word in enumerate(matches[:max_results], 1):
            print(f"  {i:3d}. {word}")
        
        if len(matches) > max_results:
            print(f"\nüí° {len(matches) - max_results} more matches not shown")
    
    def get_vocabulary_stats(self):
        """Get detailed vocabulary statistics."""
        vocab = self.model.wv.index_to_key
        lengths = [len(w) for w in vocab]
        
        print(f"\nüìä Vocabulary Statistics:")
        print("="*60)
        print(f"  Total words: {len(vocab):,}")
        print(f"  Shortest word: '{min(vocab, key=len)}' ({min(lengths)} chars)")
        print(f"  Longest word: '{max(vocab, key=len)}' ({max(lengths)} chars)")
        print(f"  Average length: {np.mean(lengths):.2f} chars")
        print(f"  Median length: {np.median(lengths):.0f} chars")
        
        # Character distribution
        first_chars = {}
        for word in vocab:
            if word:
                first = word[0].lower()
                first_chars[first] = first_chars.get(first, 0) + 1
        
        print(f"\nüìù Top 10 starting letters:")
        for char, count in sorted(first_chars.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  '{char}': {count} words ({100*count/len(vocab):.1f}%)")
    
    def show_word_vector(self, word):
        """Display the actual vector for a word."""
        if word not in self.model.wv:
            print(f"‚ùå '{word}' not in vocabulary!")
            return
        
        vector = self.model.wv[word]
        print(f"\nüî¢ Vector for '{word}':")
        print("="*60)
        print(f"  Dimensions: {len(vector)}")
        print(f"  First 10 values: {vector[:10]}")
        print(f"  Min: {vector.min():.4f}")
        print(f"  Max: {vector.max():.4f}")
        print(f"  Mean: {vector.mean():.4f}")
        print(f"  Std: {vector.std():.4f}")
    
    def evaluate_analogies(self, analogies):
        """
        Evaluate word analogies with detailed metrics.

        For each tuple the model is queried for the words closest to
        a - b + c (top 5); the first candidate that is not one of a, b, c is
        the prediction and is compared with expected.

        Args:
            analogies: List of tuples (a, b, c, expected)
                      Tests: a - b + c is closest to expected
                      Example: ("king", "man", "woman", "queen")

        Returns:
            Tuple of (metrics dict, results list). Each result is a dict
            with the keys 'analogy', 'expected', 'predicted', 'correct' and
            'skipped'; evaluated entries also carry 'candidates' (top 3),
            and entries whose model query failed carry 'error'.
        """
        print(f"\nüìê Evaluating {len(analogies)} analogies...")
        print("="*60)

        wv = self.model.wv
        correct = 0
        attempted = 0
        skipped = 0
        results = []

        for a, b, c, expected in analogies:
            # Same label for every outcome; the report appends "= expected".
            outcome = {
                'analogy': f"{a} - {b} + {c}",
                'expected': expected,
                'predicted': None,
                'correct': False,
                'skipped': True
            }

            if any(w not in wv for w in (a, b, c, expected)):
                skipped += 1
                results.append(outcome)
                continue

            try:
                similar = wv.most_similar(positive=[c, a], negative=[b], topn=5)
            except Exception as exc:
                # Best-effort evaluation: a failing query is recorded as
                # skipped (and only as skipped) instead of aborting the run.
                skipped += 1
                outcome['error'] = str(exc)
                results.append(outcome)
                continue

            # Count the analogy as attempted only once the query succeeded,
            # so it is never tallied as both attempted and skipped.
            attempted += 1

            predicted = next(
                (candidate for candidate, _ in similar if candidate not in (a, b, c)),
                None
            )
            is_correct = (predicted == expected)
            if is_correct:
                correct += 1

            outcome.update({
                'predicted': predicted,
                'correct': is_correct,
                'skipped': False,
                'candidates': similar[:3]
            })
            results.append(outcome)

        # Every attempted analogy yields exactly one prediction, so a miss
        # counts as both a false positive and a false negative.
        tp = correct
        fp = attempted - correct
        fn = attempted - correct

        metrics = calculate_metrics(tp, fp, fn)

        # Display results
        print(f"\nüìä Analogy Evaluation Results:")
        print("="*60)
        print(f"  Total analogies: {len(analogies)}")
        print(f"  Attempted: {attempted}")
        print(f"  Skipped (missing words): {skipped}")
        print(f"  Correct: {correct}")
        print(f"  Incorrect: {attempted - correct}")
        print(f"\n  Accuracy: {(correct/attempted*100) if attempted > 0 else 0:.2f}%")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")

        print(f"\nüìù Sample Results (first 10):")
        print("="*60)
        for i, result in enumerate(results[:10], 1):
            if result['skipped']:
                print(f"  {i}. {result['analogy']} = {result['expected']} ‚ö†Ô∏è SKIPPED")
            else:
                status = "‚úÖ" if result['correct'] else "‚ùå"
                print(f"  {i}. {result['analogy']} = {result['expected']}")
                print(f"      Predicted: {result['predicted']} {status}")

        return metrics, results
    
    def evaluate_similarity(self, word_pairs):
        """
        Evaluate word similarity against human judgments.

        Args:
            word_pairs: List of tuples (word1, word2, human_score)
                       human_score: 0-1 or 0-10. The scale is detected once
                       for the whole list: if any score exceeds 1, all
                       scores are divided by 10.

        Returns:
            Dictionary with the keys 'spearman', 'pearson', 'metrics',
            'model_scores' and 'human_scores', or None when no pair has both
            words in the vocabulary. With fewer than two valid pairs the
            correlations are reported as NaN.
        """
        print(f"\nüìè Evaluating {len(word_pairs)} word pairs...")
        print("="*60)

        wv = self.model.wv

        # Decide the rating scale per dataset, not per pair: on a 0-10
        # scale a rating of 1 must become 0.1, not stay at 1.0.
        scale = 10.0 if any(score > 1 for _, _, score in word_pairs) else 1.0

        model_scores = []
        human_scores = []
        skipped = 0

        for word1, word2, human_score in word_pairs:
            if word1 not in wv or word2 not in wv:
                skipped += 1
                continue

            model_scores.append(wv.similarity(word1, word2))
            human_scores.append(human_score / scale)

        if len(model_scores) == 0:
            print("‚ùå No valid word pairs to evaluate!")
            return None

        # Correlation needs at least two observations; pearsonr raises
        # ValueError on shorter input.
        if len(model_scores) >= 2:
            spearman_corr, spearman_p = spearmanr(human_scores, model_scores)
            pearson_corr, pearson_p = pearsonr(human_scores, model_scores)
        else:
            spearman_corr = spearman_p = float('nan')
            pearson_corr = pearson_p = float('nan')

        # Binary classification metrics (threshold at 0.5)
        threshold = 0.5
        tp = fp = fn = tn = 0
        for human, predicted in zip(human_scores, model_scores):
            human_similar = human >= threshold
            model_similar = predicted >= threshold
            if human_similar and model_similar:
                tp += 1
            elif model_similar:
                fp += 1
            elif human_similar:
                fn += 1
            else:
                tn += 1

        metrics = calculate_metrics(tp, fp, fn, tn)

        print(f"\nüìä Similarity Evaluation Results:")
        print("="*60)
        print(f"  Total pairs: {len(word_pairs)}")
        print(f"  Evaluated: {len(model_scores)}")
        print(f"  Skipped: {skipped}")
        print(f"\n  Spearman Correlation: {spearman_corr:.4f} (p={spearman_p:.4f})")
        print(f"  Pearson Correlation: {pearson_corr:.4f} (p={pearson_p:.4f})")
        print(f"\n  Binary Classification (threshold={threshold}):")
        print(f"    Accuracy: {metrics['accuracy']:.4f}")
        print(f"    Precision: {metrics['precision']:.4f}")
        print(f"    Recall: {metrics['recall']:.4f}")
        print(f"    F1-Score: {metrics['f1_score']:.4f}")

        return {
            'spearman': spearman_corr,
            'pearson': pearson_corr,
            'metrics': metrics,
            'model_scores': model_scores,
            'human_scores': human_scores
        }
    
    def evaluate_categorization(self, word_categories):
        """
        Evaluate word categorization with a nearest-neighbour test.

        Each in-vocabulary word is assigned the category of its most similar
        word among the other supplied words; the assignment is correct when
        it matches the word's own category. Neighbours are ranked only
        within the supplied words, so every word with at least one other
        candidate is evaluated.

        Args:
            word_categories: Dict of category_name -> list of words

        Returns:
            Metrics dictionary, or None when fewer than two of the supplied
            words are in the vocabulary.
        """
        print(f"\nüè∑Ô∏è Evaluating categorization for {len(word_categories)} categories...")
        print("="*60)

        wv = self.model.wv

        labelled = [
            (word, category)
            for category, words in word_categories.items()
            for word in words
            if word in wv
        ]

        if len(labelled) < 2:
            print("‚ùå Not enough valid words for categorization!")
            return None

        print(f"  Valid words: {len(labelled)}")

        # Word -> category lookup; if a word is listed under several
        # categories, the first listing decides its label as a neighbour.
        category_of = {}
        for word, category in labelled:
            category_of.setdefault(word, category)
        candidates = list(category_of)

        correct = 0
        total = 0

        for word, true_category in labelled:
            others = [candidate for candidate in candidates if candidate != word]
            if not others:
                continue

            # Rank within the labelled set rather than the whole vocabulary,
            # where the nearest labelled word may not be in the top-n.
            nearest = max(others, key=lambda other: wv.similarity(word, other))
            total += 1
            if category_of[nearest] == true_category:
                correct += 1

        # One prediction per word: a miss is both a false positive and a
        # false negative.
        tp = correct
        fp = total - correct
        fn = total - correct

        metrics = calculate_metrics(tp, fp, fn)

        print(f"\nüìä Categorization Results:")
        print("="*60)
        print(f"  Accuracy: {(correct/total*100) if total > 0 else 0:.2f}%")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")

        return metrics
    
    def interactive_analogy_test(self):
        """Interactive analogy testing with user input."""
        print("\nüìê Interactive Analogy Test")
        print("="*60)
        print("Format: word1 - word2 + word3 = ?")
        print("Example: king - man + woman = queen")
        print("Enter analogies one per line (empty line to finish)")
        print()
        
        analogies = []
        while True:
            line = input("  Analogy (format: a b c expected): ").strip()
            if not line:
                break
            
            parts = line.split()
            if len(parts) == 4:
                analogies.append(tuple(parts))
            else:
                print("  ‚ö†Ô∏è  Invalid format! Use: word1 word2 word3 expected")
        
        if analogies:
            self.evaluate_analogies(analogies)
        else:
            print("‚ùå No analogies entered!")
    
    def interactive_similarity_test(self):
        """Interactive similarity testing with user input."""
        print("\nüìè Interactive Similarity Test")
        print("="*60)
        print("Enter word pairs with human similarity scores")
        print("Format: word1 word2 score (score: 0-10 or 0-1)")
        print("Example: cat dog 8")
        print("Enter pairs one per line (empty line to finish)")
        print()
        
        word_pairs = []
        while True:
            line = input("  Pair: ").strip()
            if not line:
                break
            
            parts = line.split()
            if len(parts) == 3:
                try:
                    word1, word2, score = parts[0], parts[1], float(parts[2])
                    word_pairs.append((word1, word2, score))
                except ValueError:
                    print("  ‚ö†Ô∏è  Invalid score! Must be a number")
            else:
                print("  ‚ö†Ô∏è  Invalid format! Use: word1 word2 score")
        
        if word_pairs:
            self.evaluate_similarity(word_pairs)
        else:
            print("‚ùå No word pairs entered!")
    
    def interactive_categorization_test(self):
        """Interactive categorization testing with user input."""
        print("\nüè∑Ô∏è Interactive Categorization Test")
        print("="*60)
        print("Enter categories and their words")
        print("Format: category: word1 word2 word3")
        print("Example: animals: cat dog bird")
        print("Enter categories one per line (empty line to finish)")
        print()
        
        categories = {}
        while True:
            line = input("  Category: ").strip()
            if not line:
                break
            
            if ':' in line:
                cat_name, words = line.split(':', 1)
                cat_name = cat_name.strip()
                word_list = words.strip().split()
                if word_list:
                    categories[cat_name] = word_list
            else:
                print("  ‚ö†Ô∏è  Invalid format! Use: category: word1 word2 word3")
        
        if categories:
            self.evaluate_categorization(categories)
        else:
            print("‚ùå No categories entered!")
    
    def visualize_vocabulary_2d(self, words=None, num_words=50, highlight_words=None):
        """
        Plot word vectors in 2D using the PCA projection from manual_pca_2d.

        The scatter plot is saved as a PNG in the working directory and then
        shown. At most 200 words are plotted. Errors raised while building
        the plot are caught and reported on stdout.

        Args:
            words: Words to plot; those missing from the vocabulary are
                dropped. When None, the first num_words entries of the
                vocabulary are used.
            num_words: Number of vocabulary words to plot when words is None
            highlight_words: Words drawn in red with larger, bold labels
        """
        if not HAS_MATPLOTLIB:
            print("‚ùå Matplotlib not installed! Cannot create visualization.")
            print("üí° Install with: pip install matplotlib")
            return

        if num_words > 200:
            print(f"‚ö†Ô∏è  Limiting to 200 words for speed (you requested {num_words})")
            num_words = 200

        print(f"\nüé® Generating 2D visualization using FAST PCA...")

        if words is not None:
            words = [token for token in words if token in self.model.wv]
            if not words:
                print("‚ùå None of the specified words are in vocabulary!")
                return
            if len(words) > 200:
                print(f"‚ö†Ô∏è  Limiting to 200 words for speed")
                words = words[:200]
        else:
            words = self.model.wv.index_to_key[:num_words]

        word_count = len(words)
        print(f"   Processing {word_count} words...")

        def _emphasised(token):
            # True only when a highlight collection was given and holds token.
            return bool(highlight_words) and token in highlight_words

        try:
            embedding_matrix = np.array([self.model.wv[token] for token in words])
            coords = manual_pca_2d(embedding_matrix)

            print(f"   Creating plot...")

            plt.figure(figsize=(14, 10))

            if highlight_words:
                colors = ['red' if token in highlight_words else 'blue' for token in words]
                sizes = [100 if token in highlight_words else 50 for token in words]
            else:
                colors = 'blue'
                sizes = 50

            plt.scatter(coords[:, 0], coords[:, 1], c=colors, alpha=0.6, s=sizes)

            # Label every k-th point so the plot carries roughly 100 labels.
            label_step = max(1, word_count // 100)
            for idx in range(0, word_count, label_step):
                token = words[idx]
                emphasised = _emphasised(token)
                plt.annotate(token, xy=(coords[idx, 0], coords[idx, 1]),
                             xytext=(3, 3), textcoords='offset points',
                             fontsize=10 if emphasised else 8, alpha=0.8,
                             fontweight='bold' if emphasised else 'normal')

            plt.title(f'Word2Vec Vocabulary (FAST PCA) - {word_count} words',
                      fontsize=14, fontweight='bold')
            plt.xlabel('Principal Component 1')
            plt.ylabel('Principal Component 2')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            filename = f'vocab_visualization_pca_{word_count}words.png'
            plt.savefig(filename, dpi=150, bbox_inches='tight')
            print(f"‚úÖ Saved to {filename}")
            plt.show()

        except Exception as e:
            print(f"‚ùå Error creating visualization: {e}")
    
    def visualize_word_neighbors(self, word, topn=20):
        """
        Show a word together with its nearest neighbours.

        With matplotlib available, the word and its neighbours are plotted
        through visualize_vocabulary_2d (the word itself highlighted) and the
        first ten similarity scores are printed. Without matplotlib, a text
        listing with a bar per neighbour is printed instead.

        Args:
            word: Target word
            topn: Number of neighbours to retrieve
        """
        if not HAS_MATPLOTLIB:
            print("‚ùå Matplotlib not installed! Showing text version instead...\n")
            if word not in self.model.wv:
                print(f"‚ùå '{word}' not in vocabulary!")
                return

            neighbours = self.model.wv.most_similar(word, topn=topn)
            print(f"üìù '{word}' and its {topn} nearest neighbors:")
            print("="*60)
            print(f"  TARGET: {word}")
            for rank, (neighbour, score) in enumerate(neighbours, 1):
                # Bar width is the similarity scaled to a 40-character field.
                bar = '‚ñà' * int(score * 40)
                print(f"  {rank:2d}. {neighbour:15s} [{bar:40s}] {score:.4f}")
            return

        if word not in self.model.wv:
            print(f"‚ùå '{word}' not in vocabulary!")
            return

        print(f"\nüéØ Visualizing '{word}' and its {topn} nearest neighbors...")

        try:
            neighbours = self.model.wv.most_similar(word, topn=topn)
            plotted_words = [word]
            plotted_words.extend(neighbour for neighbour, _ in neighbours)

            self.visualize_vocabulary_2d(
                words=plotted_words,
                highlight_words=[word]
            )

            print(f"\nüìä Similarity scores for '{word}':")
            for neighbour, score in neighbours[:10]:
                print(f"  ‚Ä¢ {neighbour}: {score:.4f}")

        except Exception as e:
            print(f"‚ùå Error visualizing neighbors: {e}")
    
    def visualize_word_clusters(self, word_groups):
        """Visualize multiple groups of related words with different colors."""
        if not HAS_MATPLOTLIB:
            print("‚ùå Matplotlib not installed! Showing text version instead...\n")
            if isinstance(word_groups, dict):
                for group_name, words in word_groups.items():
                    valid_words = [w for w in words if w in self.model.wv]
                    print(f"\nüì¶ Group: {group_name} ({len(valid_words)} words)")
                    print("="*60)
                    for word in valid_words:
                        print(f"  ‚Ä¢ {word}")
            return
        
        print(f"\nüé® Visualizing word clusters...")
        
        try:
            if isinstance(word_groups, dict):
                all_words = []
                colors = []
                color_map = plt.cm.get_cmap('tab10')
                group_info = []
                
                for i, (group_name, words) in enumerate(word_groups.items()):
                    valid_words = [w for w in words if w in self.model.wv]
                    all_words.extend(valid_words)
                    colors.extend([color_map(i)] * len(valid_words))
                    group_info.append((group_name, len(valid_words), color_map(i)))
                    print(f"  Group '{group_name}': {len(valid_words)} words")
            else:
                all_words = []
                colors = []
                color_map = plt.cm.get_cmap('tab10')
                group_info = []
                
                for i, words in enumerate(word_groups):
                    valid_words = [w for w in words if w in self.model.wv]
                    all_words.extend(valid_words)
                    colors.extend([color_map(i)] * len(valid_words))
            
            if not all_words:
                print("‚ùå No valid words found!")
                return
            
            if len(all_words) > 200:
                print(f"‚ö†Ô∏è  Limiting to 200 words for speed")
                all_words = all_words[:200]
                colors = colors[:200]
            
            print(f"   Processing {len(all_words)} words...")
            
            word_vectors = np.array([self.model.wv[word] for word in all_words])
            coords = manual_pca_2d(word_vectors)
            
            print(f"   Creating plot...")
            
            plt.figure(figsize=(14, 10))
            plt.scatter(coords[:, 0], coords[:, 1], c=colors, alpha=0.6, s=100)
            
            label_step = max(1, len(all_words) // 80)
            for i in range(0, len(all_words), label_step):
                word = all_words[i]
                plt.annotate(word, xy=(coords[i, 0], coords[i, 1]),
                            xytext=(3, 3), textcoords='offset points',
                            fontsize=9, alpha=0.8, fontweight='bold')
            
            if isinstance(word_groups, dict) and group_info:
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color, label=f'{name} ({count})')
                       for name, count, color in group_info]
    plt.legend(handles=legend_elements, loc='best')

SyntaxError: incomplete input (1592693646.py, line 763)