In [10]:
!pip install gensim



In [11]:
!pip install fasttext



In [12]:
import nltk
from nltk.corpus import stopwords
import os

nltk.download('stopwords')
french_stop_words = set(stopwords.words('french'))

french_stop_words.update(
    ['.', ',', '(', ')', '\'', '-', ';', ':', '</s>', '/', '\\', '\',','d\''])


def remove_stopwords_from_file(input_path, output_path):
    print(f"Processing {input_path}...")

    if not os.path.exists(input_path):
        print(f"❌ File not found: {input_path}")
        return

    with open(input_path, 'r', encoding='utf-8') as fin, \
            open(output_path, 'w', encoding='utf-8') as fout:

        for line in fin:
            # Split line into words (tokens)
            tokens = line.strip().split()

            # Keep only words NOT in the stop word list
            filtered_tokens = [
                t for t in tokens if t.lower() not in french_stop_words]

            # Write back to new file if the line is not empty
            if filtered_tokens:
                fout.write(" ".join(filtered_tokens) + "\n")

    print(f"✅ Created cleaned file: {output_path}")



base_dir = 'TP_ISD2020'
med_original = os.path.join(
    base_dir, 'QUAERO_FrenchMed', 'QUAERO_FrenchMed_traindev.ospl')
press_original = os.path.join(
    base_dir, 'QUAERO_FrenchPress', 'QUAERO_FrenchPress_traindev.ospl')

med_cleaned = "QUAERO_FrenchMed_cleaned.ospl"
press_cleaned = "QUAERO_FrenchPress_cleaned.ospl"

remove_stopwords_from_file(med_original, med_cleaned)
remove_stopwords_from_file(press_original, press_cleaned)

Processing TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl...


✅ Created cleaned file: QUAERO_FrenchMed_cleaned.ospl
Processing TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/idrissamahamoudoudicko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Created cleaned file: QUAERO_FrenchPress_cleaned.ospl


In [13]:
import logging
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

logging.basicConfig(format='%(asctime)s : %(levelname)s:%(message)s', level=logging.INFO)
corpus_med = "QUAERO_FrenchMed_cleaned.ospl"
corpus_press = "QUAERO_FrenchPress_cleaned.ospl"

DIM = 100
MIN_COUNT = 1
def train_w2v(corpus_file, model_type, name_suffix):
    print(f"--- Training {name_suffix} ({'Skkipgram' if model_type == 1 else 'CBOW'})")
    model = Word2Vec(sentences=LineSentence(corpus_file),
                     vector_size=DIM,
                     min_count = MIN_COUNT,
                     sg = model_type,
                     workers = 4)
    filename = f"w2v_{name_suffix}_{'sg' if model_type==1 else 'cbow'}.model"
    model.save(filename)
    print(f'saved:{filename}')

In [14]:

train_w2v(corpus_med, 0, "med")  # CBOW
train_w2v(corpus_med, 1, "med")  # Skipgram

2026-02-03 14:42:10,172 : INFO:collecting all words and their counts
2026-02-03 14:42:10,175 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-02-03 14:42:10,252 : INFO:collected 8937 word types from a corpus of 28225 raw words and 3021 sentences
2026-02-03 14:42:10,252 : INFO:Creating a fresh vocabulary
2026-02-03 14:42:10,283 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 8937 unique words (100.00% of original 8937, drops 0)', 'datetime': '2026-02-03T14:42:10.282995', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-02-03 14:42:10,283 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 28225 word corpus (100.00% of original 28225, drops 0)', 'datetime': '2026-02-03T14:42:10.283698', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:5

--- Training med (CBOW)


2026-02-03 14:42:10,389 : INFO:estimated required memory for 8937 words and 100 dimensions: 11618100 bytes
2026-02-03 14:42:10,390 : INFO:resetting layer weights
2026-02-03 14:42:10,400 : INFO:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2026-02-03T14:42:10.400193', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'build_vocab'}
2026-02-03 14:42:10,400 : INFO:Word2Vec lifecycle event {'msg': 'training model with 4 workers on 8937 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2026-02-03T14:42:10.400893', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'train'}
2026-02-03 14:42:10,444 : INFO:EPOCH 0: training on 28225 raw words (26731 effective words

saved:w2v_med_cbow.model
--- Training med (Skkipgram)


2026-02-03 14:42:10,863 : INFO:EPOCH 1: training on 28225 raw words (26757 effective words) took 0.1s, 451854 effective words/s
2026-02-03 14:42:10,928 : INFO:EPOCH 2: training on 28225 raw words (26721 effective words) took 0.1s, 423861 effective words/s
2026-02-03 14:42:11,005 : INFO:EPOCH 3: training on 28225 raw words (26700 effective words) took 0.1s, 356348 effective words/s
2026-02-03 14:42:11,088 : INFO:EPOCH 4: training on 28225 raw words (26706 effective words) took 0.1s, 328797 effective words/s
2026-02-03 14:42:11,089 : INFO:Word2Vec lifecycle event {'msg': 'training on 141125 raw words (133615 effective words) took 0.3s, 384310 effective words/s', 'datetime': '2026-02-03T14:42:11.089226', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'train'}
2026-02-03 14:42:11,089 : INFO:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=8937, vector_size=100, alp

saved:w2v_med_sg.model


In [15]:
train_w2v(corpus_press, 0, "press")  # CBOW
train_w2v(corpus_press, 1, "press")  # Skipgram

2026-02-03 14:42:11,120 : INFO:collecting all words and their counts
2026-02-03 14:42:11,123 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-02-03 14:42:11,198 : INFO:PROGRESS: at sentence #10000, processed 172881 words, keeping 19801 word types
2026-02-03 14:42:11,253 : INFO:PROGRESS: at sentence #20000, processed 328008 words, keeping 28641 word types
2026-02-03 14:42:11,310 : INFO:PROGRESS: at sentence #30000, processed 501568 words, keeping 34911 word types


--- Training press (CBOW)


2026-02-03 14:42:11,363 : INFO:collected 39468 word types from a corpus of 678774 raw words and 38546 sentences
2026-02-03 14:42:11,364 : INFO:Creating a fresh vocabulary
2026-02-03 14:42:11,439 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 39468 unique words (100.00% of original 39468, drops 0)', 'datetime': '2026-02-03T14:42:11.439767', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-02-03 14:42:11,440 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 678774 word corpus (100.00% of original 678774, drops 0)', 'datetime': '2026-02-03T14:42:11.440237', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-02-03 14:42:11,570 : INFO:deleting the raw counts dictiona

saved:w2v_press_cbow.model
--- Training press (Skkipgram)


2026-02-03 14:42:15,884 : INFO:collected 39468 word types from a corpus of 678774 raw words and 38546 sentences
2026-02-03 14:42:15,885 : INFO:Creating a fresh vocabulary
2026-02-03 14:42:15,965 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 39468 unique words (100.00% of original 39468, drops 0)', 'datetime': '2026-02-03T14:42:15.965602', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-02-03 14:42:15,966 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 678774 word corpus (100.00% of original 678774, drops 0)', 'datetime': '2026-02-03T14:42:15.966267', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-02-03 14:42:16,177 : INFO:deleting the raw counts dictiona

saved:w2v_press_sg.model


In [16]:
import fasttext
def train_ft(corpus_file, name_suffix):
    print(f"-- Training FastTest CBOW on {name_suffix}")

    model = fasttext.train_unsupervised(corpus_file, model='cbow',dim=1, minCount=1)
    filename_bin = f'ft_{name_suffix}_cbow.bin'
    model.save_model(filename_bin)
    print(f"Saved: {filename_bin}")

train_ft(corpus_med, "med")
train_ft(corpus_press, "press")

Read 0M words
Number of words:  8938
Number of labels: 0


-- Training FastTest CBOW on med


Progress: 100.0% words/sec/thread:  212181 lr:  0.000000 avg.loss:  3.736136 ETA:   0h 0m 0s
Read 0M words
Number of words:  39469
Number of labels: 0


Saved: ft_med_cbow.bin
-- Training FastTest CBOW on press


Progress:  97.2% words/sec/thread:  333455 lr:  0.001392 avg.loss:  2.615353 ETA:   0h 0m 0s

Saved: ft_press_cbow.bin


Progress: 100.0% words/sec/thread:  320630 lr:  0.000000 avg.loss:  2.609917 ETA:   0h 0m 0s


In [17]:
from gensim.models import KeyedVectors
import pandas as pd

words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']

def get_neighbors(model, word, approach_name):
    try:
        if hasattr(model, 'wv'):
            neighbors = model.wv.most_similar(word, topn=5)
        elif hasattr(model, 'get_nearest_neighbors'):
            neighbors = [(w,s) for s, w in model.get_nearest_neighbors(word, k=5)]
        else:
            return "Error"
    except KeyError:
        return "Word not found"

In [None]:
import pandas as pd
from gensim.models import Word2Vec
import fasttext

# Load Models (Ensure these variables are active in memory)
w2v_med_cbow = Word2Vec.load("w2v_med_cbow.model")
w2v_med_sg = Word2Vec.load("w2v_med_sg.model")
w2v_press_cbow = Word2Vec.load("w2v_press_cbow.model")
w2v_press_sg = Word2Vec.load("w2v_press_sg.model")

# Load FastText (Re-loading to be safe)
ft_med = fasttext.load_model("ft_med_cbow.bin")
ft_press = fasttext.load_model("ft_press_cbow.bin")


def get_neighbors_safe(model, word):
    """
    Robust function to get neighbors.
    Returns the error message if something fails.
    """
    try:
        # GENSIM (Word2Vec)
        if hasattr(model, 'wv'):
            # Check if word is in vocab first
            if word not in model.wv.key_to_index:
                return "Not in Vocab"
            neighbors = model.wv.most_similar(word, topn=5)
            return ", ".join([n[0] for n in neighbors])

        # FASTTEXT (Library object)
        elif hasattr(model, 'get_nearest_neighbors'):
            neighbors = model.get_nearest_neighbors(word, k=5)
            # FastText returns (score, word), we want just the word
            return ", ".join([n[1] for n in neighbors])

        else:
            return "Unknown Model Type"

    except Exception as e:
        return f"Error: {str(e)}"


# Run Comparison
words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']
results = []

for word in words:
    entry = {"Word": word}
    entry["Med (W2V-CBOW)"] = get_neighbors_safe(w2v_med_cbow, word)
    entry["Med (W2V-SG)"] = get_neighbors_safe(w2v_med_sg, word)
    entry["Med (FT)"] = get_neighbors_safe(ft_med, word)
    entry["Press (W2V-CBOW)"] = get_neighbors_safe(w2v_press_cbow, word)
    entry["Press (FT)"] = get_neighbors_safe(ft_press, word)
    results.append(entry)

df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)

try:
    display(df)
except:
    print(df)

2026-02-03 14:42:36,315 : INFO:loading Word2Vec object from w2v_med_cbow.model
2026-02-03 14:42:36,343 : INFO:loading wv recursively from w2v_med_cbow.model.wv.* with mmap=None
2026-02-03 14:42:36,344 : INFO:setting ignored attribute cum_table to None
2026-02-03 14:42:36,460 : INFO:Word2Vec lifecycle event {'fname': 'w2v_med_cbow.model', 'datetime': '2026-02-03T14:42:36.460161', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'loaded'}
2026-02-03 14:42:36,470 : INFO:loading Word2Vec object from w2v_med_sg.model
2026-02-03 14:42:36,477 : INFO:loading wv recursively from w2v_med_sg.model.wv.* with mmap=None
2026-02-03 14:42:36,478 : INFO:setting ignored attribute cum_table to None
2026-02-03 14:42:36,543 : INFO:Word2Vec lifecycle event {'fname': 'w2v_med_sg.model', 'datetime': '2026-02-03T14:42:36.543589', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forg

Unnamed: 0,Word,Med (W2V-CBOW),Med (W2V-SG),Med (FT),Press (W2V-CBOW),Press (FT)
0,patient,"a, traitement, ’, chez, ""","symptômes, médicament, si, maladie, risque","chez, cas, bénéfique, reconnaissance, Plaidoyer","régulièrement, établissements, manquent, médecin, comédiens","a, c', hilarité, noeuds, anticycloniques"
1,traitement,"’, chez, être, médecin, a","médecin, TYSABRI, peut, devra, médicaments","chez, être, bénéfique, reconnaissance, Plaidoyer","ressources, techniques, nucléaire, activités, professionnelle","a, c', hilarité, noeuds, anticycloniques"
2,maladie,"’, chez, médecin, patients, traitement","étude, cours, clinique, symptômes, •","chez, cas, bénéfique, reconnaissance, Plaidoyer","population, difficultés, convaincre, acceptable, gagner","a, c', hilarité, noeuds, anticycloniques"
3,solution,"mg, perfusion, 1, ’, sous","flacon, contient, 20, mg, 100","chez, cas, bénéfique, reconnaissance, Plaidoyer","liberté, défi, légitimité, responsabilité, désir","a, c', hilarité, noeuds, anticycloniques"
4,jaune,"manifester, fréquences, propriétés, immuns, Dieu","entre, tolcapone, sodium, Etude, rapport","chez, cas, bénéfique, reconnaissance, Plaidoyer","chauffeurs, rock, Sébastien, Gabriel, Lars","a, c', hilarité, noeuds, anticycloniques"
