In [2]:
import numpy as np
from gensim.models import KeyedVectors
from loguru import logger

# 1. Load your French FastText model in .vec format
# NOTE: this runs at module level (outside any `__main__` guard), so the
# model is loaded every time this cell/module is executed or imported.
fasttext_path = 'backend/data/cc.fr.300.vec'
logger.info(f"Loading FastText model from '{fasttext_path}'...")
# binary=False selects the plain-text word2vec format rather than the binary one.
model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)

# Report the vocabulary size; `model` stays in module scope and is passed to
# ConceptVectorManager in the example usage further down.
logger.info(f"Model loaded successfully with vocabulary size: {len(model.index_to_key)}\n")

# 2. Define a helper function to create a concept vector
def create_concept_vector(
    positive_words: list,
    negative_words: list,
    embedding_model: KeyedVectors
) -> np.ndarray:
    """
    Build a concept vector by word arithmetic.

    The vectors of the positive words are added to a zero-initialised
    float32 accumulator, then the vectors of the negative words are
    subtracted from it. Every word is lower-cased before the vocabulary
    lookup; a word that is not in the model is skipped with a warning.

    Args:
        positive_words: Words whose vectors are added
        negative_words: Words whose vectors are subtracted
        embedding_model: Gensim KeyedVectors used for the lookups

    Returns:
        The resulting concept vector as a NumPy array. If any error is
        raised while building it, the error is logged and an all-zero
        vector of the model's dimensionality is returned instead.
    """
    try:
        accumulator = np.zeros(embedding_model.vector_size, dtype=np.float32)

        # Additive terms first ...
        for term in positive_words:
            key = term.lower()
            if key not in embedding_model:
                logger.warning(f"Word not found in vocabulary: {term}")
                continue
            accumulator += embedding_model[key]
            logger.debug(f"Added vector for positive word: {term}")

        # ... then the subtractive ones.
        for term in negative_words:
            key = term.lower()
            if key not in embedding_model:
                logger.warning(f"Word not found in vocabulary: {term}")
                continue
            accumulator -= embedding_model[key]
            logger.debug(f"Subtracted vector for negative word: {term}")

    except Exception as e:
        # Best-effort contract: never propagate, fall back to a zero vector.
        logger.exception(f"Error creating concept vector: {e}")
        return np.zeros(embedding_model.vector_size, dtype=np.float32)

    else:
        return accumulator

# 3. Class to manage concept vectors
class ConceptVectorManager:
    """
    Keeps a registry of named concept vectors built from one embedding model
    and answers nearest-word queries against them.
    """

    def __init__(self, model: KeyedVectors):
        """
        Args:
            model: Gensim KeyedVectors used both to build concept vectors
                and to search for similar words
        """
        self.model = model
        # Maps concept name -> concept vector (NumPy array).
        self.concept_vectors = {}
        logger.info("ConceptVectorManager initialized")

    def create_concept(
        self,
        name: str,
        positive_words: list,
        negative_words: list = None
    ) -> bool:
        """
        Creates and stores a new concept vector.

        The concept is only stored when the resulting vector is non-zero.
        An all-zero vector is what create_concept_vector yields when no
        word was found in the vocabulary (or when it hit an internal error
        and fell back to zeros); such a vector has no direction, so any
        similarity ranking computed from it would be meaningless.

        Args:
            name: Name of the concept
            positive_words: List of words to add
            negative_words: List of words to subtract (optional)

        Returns:
            bool: True if the concept was created and stored, False if the
            vector came out all-zero or an error occurred
        """
        try:
            # None (the default) and an empty list both mean "nothing to subtract".
            negative_words = negative_words or []
            concept_vec = create_concept_vector(
                positive_words=positive_words,
                negative_words=negative_words,
                embedding_model=self.model
            )

            # Refuse to register a degenerate concept instead of reporting success.
            if not np.any(concept_vec):
                logger.error(
                    f"Concept '{name}' not created: resulting vector is all zeros "
                    f"(no usable words or an internal error)"
                )
                return False

            self.concept_vectors[name] = concept_vec
            logger.info(f"Created concept vector: {name}")
            return True

        except Exception as e:
            logger.exception(f"Error creating concept {name}: {e}")
            return False

    def get_similar_words(self, concept_name: str, topn: int = 10) -> list:
        """
        Find words most similar to a stored concept vector.

        Args:
            concept_name: Name of the stored concept
            topn: Number of similar words to return

        Returns:
            List of (word, similarity) tuples; an empty list if the concept
            is unknown or the lookup fails
        """
        try:
            if concept_name not in self.concept_vectors:
                logger.error(f"Concept not found: {concept_name}")
                return []

            results = self.model.similar_by_vector(
                self.concept_vectors[concept_name],
                topn=topn
            )
            logger.info(f"Found {len(results)} similar words for concept: {concept_name}")
            return results

        except Exception as e:
            # Best-effort query: log the failure and hand back an empty result.
            logger.exception(f"Error finding similar words for concept {concept_name}: {e}")
            return []

# 4. Example usage
if __name__ == "__main__":
    # A single manager wraps the module-level model for every demo concept.
    manager = ConceptVectorManager(model)

    # Each entry: (concept name, words to add, words to subtract).
    # The demos run in the order listed.
    demo_concepts = [
        # "roi - homme + femme = reine"
        ("royalty_female", ["roi", "femme"], ["homme"]),
        # "Paris without tourism"
        ("paris_local", ["paris", "quotidien", "habitant"], ["touriste", "tourisme"]),
    ]

    for concept_name, plus_words, minus_words in demo_concepts:
        manager.create_concept(
            name=concept_name,
            positive_words=plus_words,
            negative_words=minus_words
        )

        # Query the ten nearest words and print them with their scores.
        results = manager.get_similar_words(concept_name, topn=10)

        print(f"\nTop 10 words closest to the '{concept_name}' concept:\n")
        for word, sim in results:
            print(f"{word} (similarity: {sim:.4f})")

[32m2025-01-31 10:17:52.602[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading FastText model from 'backend/data/cc.fr.300.vec'...[0m


KeyboardInterrupt: 

In [None]:
from gensim.models import KeyedVectors
import numpy as np
from tqdm import tqdm

def reduce_fasttext_model(input_path: str, output_path: str, n_words: int = 50000):
    """
    Create a reduced version of the FastText model keeping only the first
    ``n_words`` entries of its vocabulary.

    The output is written in the text word2vec format: a header line
    ``<word count> <vector size>`` followed by one ``word v1 v2 ...`` line
    per word. The header carries the number of words actually written, which
    is smaller than ``n_words`` when the source vocabulary is smaller.

    Args:
        input_path: Path to the original .vec file
        output_path: Where to save the reduced model; missing parent
            directories are created
        n_words: Maximum number of words to keep (must be >= 0)

    Raises:
        ValueError: If ``n_words`` is negative
    """
    import os  # local import so the function does not rely on file-level imports

    # Validate before the (slow) model load so bad input fails immediately.
    if n_words < 0:
        raise ValueError(f"n_words must be non-negative, got {n_words}")

    print(f"Loading original model from {input_path}")
    # NOTE(review): load_word2vec_format also accepts a `limit` argument that
    # would avoid reading the whole file, at the cost of no longer knowing the
    # original vocabulary size reported below - consider it if load time matters.
    model = KeyedVectors.load_word2vec_format(input_path)

    # Get the vocabulary size and vector dimension
    vocab_size = len(model.index_to_key)
    vector_size = model.vector_size

    print(f"Original model: {vocab_size} words, {vector_size} dimensions")

    # Keep only the first n_words; this relies on the .vec file listing words
    # in descending frequency order (as the original author notes for FastText).
    reduced_words = model.index_to_key[:n_words]
    # The slice may be shorter than n_words; the header must match reality or
    # the reduced file cannot be loaded back.
    kept_count = len(reduced_words)

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the reduced model in word2vec format
    print(f"Writing reduced model to {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        # Header: number of words and vector dimension
        f.write(f"{kept_count} {vector_size}\n")

        # Write each word and its vector
        for word in tqdm(reduced_words, desc="Writing vectors"):
            vector_str = ' '.join(f"{x:.6f}" for x in model[word])
            f.write(f"{word} {vector_str}\n")

    print(f"Created reduced model with {kept_count} words")

if __name__ == "__main__":
    # Full French FastText vectors in, 50k-word reduced copy out.
    source_vec = "backend/data/cc.fr.300.vec"
    reduced_vec = "data/cc.fr.300.reduced.vec"

    reduce_fasttext_model(input_path=source_vec, output_path=reduced_vec, n_words=50000)

Loading original model from backend/data/cc.fr.300.vec
