In [3]:
import numpy as np
from gensim.models import KeyedVectors
from loguru import logger

# 1. Load the French FastText vectors.
#    The file is parsed with gensim's word2vec loader in text mode
#    (binary=False), matching the .vec extension of the path below.
fasttext_path = 'backend/data/cc.fr.300.reduced.vec'
logger.info(f"Loading FastText model from '{fasttext_path}'...")
model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)

# Report the vocabulary size; index_to_key holds one entry per loaded word.
logger.info(f"Model loaded successfully with vocabulary size: {len(model.index_to_key)}\n")

# 2. Define a helper function to create a concept vector
def create_concept_vector(
    positive_words: list, 
    negative_words: list, 
    embedding_model: KeyedVectors
) -> np.ndarray:
    """
    Build a concept vector by adding the vectors of all positive words
    and subtracting the vectors of all negative words.

    Each word is looked up in lowercase first; if the lowercase form is
    not in the vocabulary, the word is looked up exactly as given (the
    vocabulary is case-sensitive, so some tokens only exist in cased form).
    Words found under neither spelling are skipped with a warning.

    Args:
        positive_words: List of words to add (None is treated as empty)
        negative_words: List of words to subtract (None is treated as empty)
        embedding_model: A Gensim KeyedVectors model with FastText vectors

    Returns:
        A float32 NumPy array of length ``embedding_model.vector_size``.
        It is all zeros when no word was found, or when an unexpected
        error occurred (the error is logged, not raised).
    """
    try:
        concept_vec = np.zeros(embedding_model.vector_size, dtype=np.float32)
        matched = 0

        # One pass per polarity: (in-place operation, log verb, label, words).
        passes = (
            (np.add, "Added", "positive", positive_words),
            (np.subtract, "Subtracted", "negative", negative_words),
        )
        for combine, verb, polarity, words in passes:
            # Explicit None check (not `or`) so array-like inputs still work.
            for word in (() if words is None else words):
                lowered = word.lower()
                if lowered in embedding_model:
                    key = lowered
                elif word in embedding_model:
                    # Fall back to the caller's casing, e.g. proper nouns.
                    key = word
                else:
                    logger.warning(f"Word not found in vocabulary: {word}")
                    continue

                combine(concept_vec, embedding_model[key], out=concept_vec)
                matched += 1
                logger.debug(f"{verb} vector for {polarity} word: {word}")

        if matched == 0:
            # A zero vector has no direction, so similarity queries against
            # it are meaningless; make that visible in the logs.
            logger.warning("No input word was found in the vocabulary; concept vector is all zeros")

        return concept_vec

    except Exception as e:
        # Best-effort contract kept from the original: log and return zeros.
        logger.exception(f"Error creating concept vector: {e}")
        return np.zeros(embedding_model.vector_size, dtype=np.float32)

# 3. Class to manage concept vectors
class ConceptVectorManager:
    """Keeps named concept vectors built from a shared embedding model."""

    def __init__(self, model: KeyedVectors):
        # Concepts are stored by name; the dict starts empty.
        self.concept_vectors = {}
        self.model = model
        logger.info("ConceptVectorManager initialized")

    def create_concept(
        self,
        name: str,
        positive_words: list,
        negative_words: list = None
    ) -> bool:
        """
        Build a concept vector and register it under ``name``.

        Args:
            name: Key under which the concept is stored
            positive_words: Words whose vectors are added
            negative_words: Words whose vectors are subtracted (optional)

        Returns:
            bool: True when the concept was stored, False if an error
            was raised (the error is logged)
        """
        try:
            # A missing or empty negative list means "subtract nothing".
            to_subtract = negative_words if negative_words else []
            self.concept_vectors[name] = create_concept_vector(
                positive_words=positive_words,
                negative_words=to_subtract,
                embedding_model=self.model
            )
            logger.info(f"Created concept vector: {name}")
        except Exception as err:
            logger.exception(f"Error creating concept {name}: {err}")
            return False
        return True

    def get_similar_words(self, concept_name: str, topn: int = 10) -> list:
        """
        Look up the vocabulary words closest to a stored concept.

        Args:
            concept_name: Name the concept was stored under
            topn: How many neighbours to request from the model

        Returns:
            The model's ``similar_by_vector`` result (word, similarity
            pairs), or an empty list when the concept is unknown or an
            error was raised
        """
        try:
            if concept_name in self.concept_vectors:
                stored_vec = self.concept_vectors[concept_name]
                neighbours = self.model.similar_by_vector(stored_vec, topn=topn)
                logger.info(f"Found {len(neighbours)} similar words for concept: {concept_name}")
                return neighbours

            # Unknown name: report it and fall through to an empty result.
            logger.error(f"Concept not found: {concept_name}")
            return []
        except Exception as err:
            logger.exception(f"Error finding similar words for concept {concept_name}: {err}")
            return []

# 4. Example usage
if __name__ == "__main__":
    manager = ConceptVectorManager(model)

    # Each entry: (concept name, words to add, words to subtract).
    demo_concepts = (
        # Classic analogy: "roi - homme + femme = reine"
        ("royalty_female", ["roi", "femme"], ["homme"]),
        # "Paris without tourism"
        ("paris_local", ["paris", "quotidien", "habitant"], ["touriste", "tourisme"]),
    )

    for concept, plus_words, minus_words in demo_concepts:
        manager.create_concept(
            name=concept,
            positive_words=plus_words,
            negative_words=minus_words
        )

        # Query the ten nearest vocabulary words and list them.
        results = manager.get_similar_words(concept, topn=10)

        print(f"\nTop 10 words closest to the '{concept}' concept:\n")
        for word, sim in results:
            print(f"{word} (similarity: {sim:.4f})")

[32m2025-01-31 11:21:29.430[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading FastText model from 'backend/data/cc.fr.300.reduced.vec'...[0m


[32m2025-01-31 11:22:03.465[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mModel loaded successfully with vocabulary size: 50000
[0m
[32m2025-01-31 11:22:03.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m62[0m - [1mConceptVectorManager initialized[0m
[32m2025-01-31 11:22:03.479[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcreate_concept_vector[0m:[36m38[0m - [34m[1mAdded vector for positive word: roi[0m
[32m2025-01-31 11:22:03.480[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcreate_concept_vector[0m:[36m38[0m - [34m[1mAdded vector for positive word: femme[0m
[32m2025-01-31 11:22:03.481[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcreate_concept_vector[0m:[36m47[0m - [34m[1mSubtracted vector for negative word: homme[0m
[32m2025-01-31 11:22:03.485[0m | [1mINFO    [0m | [36m__main__[0m:[36mcreate_concept[0m:[36m90[0m - [1mCreated concept vector: royalty_female[0m
[32m2025


Top 10 words closest to the 'royalty_female' concept:

roi (similarity: 0.9444)
Roi (similarity: 0.7885)
reine (similarity: 0.7228)
monarque (similarity: 0.6957)
prince (similarity: 0.6507)
royaume (similarity: 0.6344)
souverain (similarity: 0.6296)
rois (similarity: 0.6225)
princesse (similarity: 0.5896)
duc (similarity: 0.5885)

Top 10 words closest to the 'paris_local' concept:

paris (similarity: 0.6702)
Paris (similarity: 0.4485)
PARIS (similarity: 0.4406)
lyon (similarity: 0.4270)
Villejuif (similarity: 0.4055)
toulouse (similarity: 0.4046)
marseille (similarity: 0.3937)
Arcueil (similarity: 0.3911)
Aubervilliers (similarity: 0.3827)
Vanves (similarity: 0.3791)


In [2]:
from gensim.models import KeyedVectors
import numpy as np
from tqdm import tqdm

def reduce_fasttext_model(input_path: str, output_path: str, n_words: int = 50000):
    """
    Create a reduced version of the FastText model keeping only the first
    ``n_words`` entries of its vocabulary.

    The output is a word2vec text file: a "<count> <dimension>" header
    followed by one "<word> <v1> <v2> ..." line per kept word, with values
    written to six decimal places.

    Args:
        input_path: Path to the original .vec file
        output_path: Where to save the reduced model
        n_words: Maximum number of words to keep; if the original model has
            fewer words, all of them are kept
    """
    print(f"Loading original model from {input_path}")
    model = KeyedVectors.load_word2vec_format(input_path)
    
    # Get the vocabulary size and vector dimension
    vocab_size = len(model.index_to_key)
    vector_size = model.vector_size
    
    print(f"Original model: {vocab_size} words, {vector_size} dimensions")
    
    # Keep only the first n_words. FastText .vec files are expected to list
    # words most-frequent first, so this keeps the most frequent ones.
    reduced_words = model.index_to_key[:n_words]

    # The header must state the number of vectors actually written, which is
    # smaller than n_words when the source vocabulary is shorter; a header
    # that over-counts makes the file unreadable by word2vec loaders.
    kept_count = len(reduced_words)
    
    # Write the reduced model in word2vec format
    print(f"Writing reduced model to {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        # Header: number of words and vector dimension
        f.write(f"{kept_count} {vector_size}\n")
        
        # Write each word and its vector
        for word in tqdm(reduced_words, desc="Writing vectors"):
            vector_str = ' '.join(f"{x:.6f}" for x in model[word])
            f.write(f"{word} {vector_str}\n")
    
    print(f"Created reduced model with {kept_count} words")

if __name__ == "__main__":
    # Shrink the full French vectors down to their first 50,000 words.
    source_vec = "backend/data/cc.fr.300.vec"
    target_vec = "backend/data/cc.fr.300.reduced.vec"
    reduce_fasttext_model(source_vec, target_vec, n_words=50000)

Loading original model from backend/data/cc.fr.300.vec
Original model: 2000000 words, 300 dimensions
Writing reduced model to backend/data/cc.fr.300.reduced.vec


Writing vectors: 100%|██████████| 50000/50000 [00:37<00:00, 1321.33it/s]


Created reduced model with 50000 words


In [3]:
import os
from pathlib import Path
import sys
from typing import List, Set

def print_directory_structure(startpath: str, exclude_dirs: Set[str] = None) -> None:
    """
    Print the directory structure starting from the specified path.

    Each directory is printed as "├── name/" followed by its visible files
    (names not starting with '.') in sorted order, then its subdirectories
    in sorted order. Nesting depth is shown with one '│   ' per level.

    Args:
        startpath: The root directory to start from; a trailing path
            separator is accepted
        exclude_dirs: Set of directory names to exclude (not descended
            into). Defaults to .git, __pycache__, node_modules, env, venv
    """
    if exclude_dirs is None:
        exclude_dirs = {'.git', '__pycache__', 'node_modules', 'env', 'venv'}

    branch = '│   '
    for root, dirs, files in os.walk(startpath):
        # Prune excluded directories in place so os.walk skips them, and
        # sort the rest so the output order is deterministic.
        dirs[:] = sorted(d for d in dirs if d not in exclude_dirs)

        # Depth is derived from the path relative to the start directory.
        # Stripping startpath as a substring miscounts when startpath ends
        # with a separator or reappears further down the path.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = branch * level

        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty name for the root.
        folder_name = os.path.basename(os.path.normpath(root)) or root
        print(f'{indent}├── {folder_name}/')

        sub_indent = branch * (level + 1)
        for file in sorted(files):
            if not file.startswith('.'):  # Skip hidden files
                print(f'{sub_indent}├── {file}')

if __name__ == "__main__":
    # The tree is always rooted at the current working directory;
    # no command-line argument is read here.
    current_dir = os.getcwd()

    for banner_line in ("\nProject Structure:", "================"):
        print(banner_line)

    print_directory_structure(current_dir)
    print("\nNote: Excluded directories: .git, __pycache__, node_modules, env, venv")



Project Structure:
├── semantix_like/
│   ├── LICENSE
│   ├── README.md
│   ├── description.md
│   ├── docker-compose.yml
│   ├── prompt.md
│   ├── requirements.txt
│   ├── todo
│   ├── vercel.json
│   ├── work.ipynb
│   ├── backend/
│   │   ├── 0.99)
│   │   ├── Dict
│   │   ├── Dockerfile
│   │   ├── List[Dict]
│   │   ├── None
│   │   ├── app.log
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   ├── routes.py
│   │   ├── str
│   │   ├── test_config.py
│   │   ├── config/
│   │   │   ├── game_config.py
│   │   ├── data/
│   │   │   ├── cc.fr.300.reduced.vec
│   │   │   ├── game_state.json
│   │   │   ├── word_list.json
│   │   ├── services/
│   │   │   ├── game_service.py
│   │   │   ├── model_downloader.py
│   │   │   ├── visualization_service.py
│   │   │   ├── word_service.py
│   ├── frontend/
│   │   ├── Dockerfile
│   │   ├── index.html
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── tailwind.config.js
│   │   ├── src/
│   │   │   ├── main.ts
│   │   │ 

In [8]:
import requests
import json
from pprint import pprint

# Base URL for your Hugging Face Space
BASE_URL = "https://miroir-semantix-api.hf.space"

def test_endpoints(timeout: float = 30.0):
    """
    Test all basic endpoints and print results.

    Sends a GET request to each diagnostic endpoint under BASE_URL, prints
    the status code and the JSON body (or the error text), then prints a
    one-line-per-endpoint summary.

    Args:
        timeout: Seconds to wait for each request before giving up. Without
            a timeout a single unresponsive endpoint would block the whole
            run indefinitely; a timed-out request is reported as an ERROR
            like any other request failure.
    """
    endpoints = [
        "/api/test/ping",
        "/api/test/model",
        "/api/test/env",
        "/api/test/model-info"
    ]
    
    results = {}
    for endpoint in endpoints:
        print(f"\nTesting {endpoint}...")
        try:
            response = requests.get(f"{BASE_URL}{endpoint}", timeout=timeout)
            print(f"Status Code: {response.status_code}")
            if response.status_code == 200:
                # A 200 with a non-JSON body raises here and is reported
                # as an ERROR by the handler below.
                pprint(response.json())
                results[endpoint] = "OK"
            else:
                print(f"Error: {response.text}")
                results[endpoint] = "FAILED"
        except Exception as e:
            # Keep going: one failing endpoint must not stop the others.
            print(f"Error: {str(e)}")
            results[endpoint] = f"ERROR: {str(e)}"
    
    print("\nSummary:")
    for endpoint, status in results.items():
        print(f"{endpoint}: {status}")

# Run the tests
test_endpoints()


Testing /api/test/ping...
Status Code: 200
{'message': 'pong', 'status': 'ok'}

Testing /api/test/model...
Status Code: 200
{'message': 'Model is working',
 'status': 'ok',
 'test_similarity': {'similarity': 0.5600714683532715,
                     'word1': 'bonjour',
                     'word2': 'salut'}}

Testing /api/test/env...
Status Code: 200
{'environment': {'host': 'r-miroir-semantix-api-lwiuyzic-9b956-nvd2y',
                 'model_url': 'https://huggingface.co/Miroir/cc.fr.300.reduced/resolve/main/cc.fr.300.reduced.vec',
                 'python_version': '3.11.11',
                 'services_initialized': {'game_service': True,
                                          'visualization_service': True,
                                          'word_service': True}},
 'status': 'ok'}

Testing /api/test/model-info...
Status Code: 200
{'model_info': {'sample_words': [',', 'de', '.', '</s>', 'la'],
                'vocabulary_size': 50000},
 'status': 'ok'}

Summary:
/api/test/