In [146]:
!pip install gensim



In [147]:
!pip install fasttext



In [148]:
import nltk
from nltk.corpus import stopwords
import os

nltk.download('stopwords')
french_stop_words = set(stopwords.words('french'))

french_stop_words.update(
    ['.', ',', '(', ')', '\'', '-', ';', ':', '</s>', '/', '\\', '\',','d\''])


def remove_stopwords_from_file(input_path, output_path):
    print(f"Processing {input_path}...")

    if not os.path.exists(input_path):
        print(f"❌ File not found: {input_path}")
        return

    with open(input_path, 'r', encoding='utf-8') as fin, \
            open(output_path, 'w', encoding='utf-8') as fout:

        for line in fin:
            # Split line into words (tokens)
            tokens = line.strip().split()

            # Keep only words NOT in the stop word list
            filtered_tokens = [
                t for t in tokens if t.lower() not in french_stop_words]

            # Write back to new file if the line is not empty
            if filtered_tokens:
                fout.write(" ".join(filtered_tokens) + "\n")

    print(f"✅ Created cleaned file: {output_path}")



base_dir = 'TP_ISD2020'
med_original = os.path.join(
    base_dir, 'QUAERO_FrenchMed', 'QUAERO_FrenchMed_traindev.ospl')
press_original = os.path.join(
    base_dir, 'QUAERO_FrenchPress', 'QUAERO_FrenchPress_traindev.ospl')

med_cleaned = "QUAERO_FrenchMed_cleaned.ospl"
press_cleaned = "QUAERO_FrenchPress_cleaned.ospl"

remove_stopwords_from_file(med_original, med_cleaned)
remove_stopwords_from_file(press_original, press_cleaned)

Processing TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl...
✅ Created cleaned file: QUAERO_FrenchMed_cleaned.ospl
Processing TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/idrissamahamoudoudicko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Created cleaned file: QUAERO_FrenchPress_cleaned.ospl


In [149]:
import logging
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

logging.basicConfig(format='%(asctime)s : %(levelname)s:%(message)s', level=logging.INFO)
corpus_med = "QUAERO_FrenchMed_cleaned.ospl"
corpus_press = "QUAERO_FrenchPress_cleaned.ospl"

DIM = 100
MIN_COUNT = 1
def train_w2v(corpus_file, model_type, name_suffix):
    print(f"--- Training {name_suffix} ({'Skkipgram' if model_type == 1 else 'CBOW'})")
    model = Word2Vec(sentences=LineSentence(corpus_file),
                     vector_size=DIM,
                     min_count = MIN_COUNT,
                     sg = model_type,
                     workers = 4)
    filename = f"w2v_{name_suffix}_{'sg' if model_type==1 else 'cbow'}.model"
    model.save(filename)
    print(f'saved:{filename}')

In [150]:

train_w2v(corpus_med, 0, "med")  # CBOW
train_w2v(corpus_med, 1, "med")  # Skipgram

2026-01-23 16:07:23,380 : INFO:collecting all words and their counts
2026-01-23 16:07:23,384 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-01-23 16:07:23,421 : INFO:collected 8937 word types from a corpus of 28225 raw words and 3021 sentences
2026-01-23 16:07:23,423 : INFO:Creating a fresh vocabulary
2026-01-23 16:07:23,469 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 8937 unique words (100.00% of original 8937, drops 0)', 'datetime': '2026-01-23T16:07:23.469083', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-01-23 16:07:23,470 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 28225 word corpus (100.00% of original 28225, drops 0)', 'datetime': '2026-01-23T16:07:23.470325', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:5

--- Training med (CBOW)


2026-01-23 16:07:23,554 : INFO:deleting the raw counts dictionary of 8937 items
2026-01-23 16:07:23,559 : INFO:sample=0.001 downsamples 26 most-common words
2026-01-23 16:07:23,563 : INFO:Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 26712.628874093698 word corpus (94.6%% of prior 28225)', 'datetime': '2026-01-23T16:07:23.562966', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-01-23 16:07:23,666 : INFO:estimated required memory for 8937 words and 100 dimensions: 11618100 bytes
2026-01-23 16:07:23,667 : INFO:resetting layer weights
2026-01-23 16:07:23,681 : INFO:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2026-01-23T16:07:23.681204', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 

saved:w2v_med_cbow.model
--- Training med (Skkipgram)


2026-01-23 16:07:24,199 : INFO:EPOCH 1: training on 28225 raw words (26757 effective words) took 0.1s, 516450 effective words/s
2026-01-23 16:07:24,253 : INFO:EPOCH 2: training on 28225 raw words (26721 effective words) took 0.1s, 507177 effective words/s
2026-01-23 16:07:24,307 : INFO:EPOCH 3: training on 28225 raw words (26700 effective words) took 0.1s, 505753 effective words/s
2026-01-23 16:07:24,363 : INFO:EPOCH 4: training on 28225 raw words (26706 effective words) took 0.1s, 484116 effective words/s
2026-01-23 16:07:24,364 : INFO:Word2Vec lifecycle event {'msg': 'training on 141125 raw words (133615 effective words) took 0.3s, 492302 effective words/s', 'datetime': '2026-01-23T16:07:24.364284', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'train'}
2026-01-23 16:07:24,364 : INFO:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=8937, vector_size=100, alp

saved:w2v_med_sg.model


In [151]:
train_w2v(corpus_press, 0, "press")  # CBOW
train_w2v(corpus_press, 1, "press")  # Skipgram

2026-01-23 16:07:24,389 : INFO:collecting all words and their counts
2026-01-23 16:07:24,390 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-01-23 16:07:24,455 : INFO:PROGRESS: at sentence #10000, processed 172881 words, keeping 19801 word types
2026-01-23 16:07:24,522 : INFO:PROGRESS: at sentence #20000, processed 328008 words, keeping 28641 word types


--- Training press (CBOW)


2026-01-23 16:07:24,608 : INFO:PROGRESS: at sentence #30000, processed 501568 words, keeping 34911 word types
2026-01-23 16:07:24,719 : INFO:collected 39468 word types from a corpus of 678774 raw words and 38546 sentences
2026-01-23 16:07:24,719 : INFO:Creating a fresh vocabulary
2026-01-23 16:07:24,829 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 39468 unique words (100.00% of original 39468, drops 0)', 'datetime': '2026-01-23T16:07:24.829924', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-01-23 16:07:24,830 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 678774 word corpus (100.00% of original 678774, drops 0)', 'datetime': '2026-01-23T16:07:24.830427', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm6

saved:w2v_press_cbow.model
--- Training press (Skkipgram)


2026-01-23 16:07:28,242 : INFO:Creating a fresh vocabulary
2026-01-23 16:07:28,318 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 39468 unique words (100.00% of original 39468, drops 0)', 'datetime': '2026-01-23T16:07:28.318671', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-01-23 16:07:28,319 : INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 678774 word corpus (100.00% of original 678774, drops 0)', 'datetime': '2026-01-23T16:07:28.319148', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'prepare_vocab'}
2026-01-23 16:07:28,447 : INFO:deleting the raw counts dictionary of 39468 items
2026-01-23 16:07:28,448 : INFO:sample=0.001 downsamples 25 most-common words
2026-01-23 16:07:

saved:w2v_press_sg.model


In [152]:
import fasttext
def train_ft(corpus_file, name_suffix):
    print(f"-- Training FastTest CBOW on {name_suffix}")

    model = fasttext.train_unsupervised(corpus_file, model='cbow',dim=1, minCount=1)
    filename_bin = f'ft_{name_suffix}_cbow.bin'
    model.save_model(filename_bin)
    print(f"Saved: {filename_bin}")

train_ft(corpus_med, "med")
train_ft(corpus_press, "press")

Read 0M words
Number of words:  8938
Number of labels: 0


-- Training FastTest CBOW on med


Progress: 100.0% words/sec/thread:  221014 lr:  0.000000 avg.loss:  3.556739 ETA:   0h 0m 0s


Saved: ft_med_cbow.bin
-- Training FastTest CBOW on press


Read 0M words
Number of words:  39469
Number of labels: 0
Progress:  99.0% words/sec/thread:  404509 lr:  0.000519 avg.loss:  2.603269 ETA:   0h 0m 0s

Saved: ft_press_cbow.bin


Progress: 100.0% words/sec/thread:  377235 lr:  0.000000 avg.loss:  2.601263 ETA:   0h 0m 0s


In [153]:
from gensim.models import KeyedVectors
import pandas as pd

words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']

def get_neighbors(model, word, approach_name):
    try:
        if hasattr(model, 'wv'):
            neighbors = model.wv.most_similar(word, topn=5)
        elif hasattr(model, 'get_nearest_neighbors'):
            neighbors = [(w,s) for s, w in model.get_nearest_neighbors(word, k=5)]
        else:
            return "Error"
    except KeyError:
        return "Word not found"

In [154]:
import pandas as pd
from gensim.models import Word2Vec
import fasttext

# Load Models (Ensure these variables are active in memory)
w2v_med_cbow = Word2Vec.load("w2v_med_cbow.model")
w2v_med_sg = Word2Vec.load("w2v_med_sg.model")
w2v_press_cbow = Word2Vec.load("w2v_press_cbow.model")
w2v_press_sg = Word2Vec.load("w2v_press_sg.model")

# Load FastText (Re-loading to be safe)
ft_med = fasttext.load_model("ft_med_cbow.bin")
ft_press = fasttext.load_model("ft_press_cbow.bin")


def get_neighbors_safe(model, word):
    """
    Robust function to get neighbors.
    Returns the error message if something fails.
    """
    try:
        # GENSIM (Word2Vec)
        if hasattr(model, 'wv'):
            # Check if word is in vocab first
            if word not in model.wv.key_to_index:
                return "Not in Vocab"
            neighbors = model.wv.most_similar(word, topn=5)
            return ", ".join([n[0] for n in neighbors])

        # FASTTEXT (Library object)
        elif hasattr(model, 'get_nearest_neighbors'):
            neighbors = model.get_nearest_neighbors(word, k=5)
            # FastText returns (score, word), we want just the word
            return ", ".join([n[1] for n in neighbors])

        else:
            return "Unknown Model Type"

    except Exception as e:
        return f"Error: {str(e)}"


# Run Comparison
words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']
results = []

for word in words:
    entry = {"Word": word}
    entry["Med (W2V-CBOW)"] = get_neighbors_safe(w2v_med_cbow, word)
    entry["Med (W2V-SG)"] = get_neighbors_safe(w2v_med_sg, word)
    entry["Med (FT)"] = get_neighbors_safe(ft_med, word)
    entry["Press (W2V-CBOW)"] = get_neighbors_safe(w2v_press_cbow, word)
    entry["Press (FT)"] = get_neighbors_safe(ft_press, word)
    results.append(entry)

# Display
df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
# Use display(df) if in Colab, otherwise print(df)
try:
    display(df)
except:
    print(df)

2026-01-23 16:07:38,580 : INFO:loading Word2Vec object from w2v_med_cbow.model
2026-01-23 16:07:38,597 : INFO:loading wv recursively from w2v_med_cbow.model.wv.* with mmap=None
2026-01-23 16:07:38,599 : INFO:setting ignored attribute cum_table to None
2026-01-23 16:07:38,653 : INFO:Word2Vec lifecycle event {'fname': 'w2v_med_cbow.model', 'datetime': '2026-01-23T16:07:38.653420', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forge | (main, Jan  8 2025, 09:15:59) [Clang 18.1.8 ]', 'platform': 'macOS-26.1-arm64-arm-64bit-Mach-O', 'event': 'loaded'}
2026-01-23 16:07:38,655 : INFO:loading Word2Vec object from w2v_med_sg.model
2026-01-23 16:07:38,704 : INFO:loading wv recursively from w2v_med_sg.model.wv.* with mmap=None
2026-01-23 16:07:38,727 : INFO:setting ignored attribute cum_table to None
2026-01-23 16:07:38,832 : INFO:Word2Vec lifecycle event {'fname': 'w2v_med_sg.model', 'datetime': '2026-01-23T16:07:38.832297', 'gensim': '4.4.0', 'python': '3.13.1 | packaged by conda-forg

Unnamed: 0,Word,Med (W2V-CBOW),Med (W2V-SG),Med (FT),Press (W2V-CBOW),Press (FT)
0,patient,"a, traitement, ’, chez, ""","symptômes, médicament, si, maladie, risque","chez, cas, Plaidoyer, dysorthographie, lymphangiomes","science, virus, voient, clients, profit","a, c', hilarité, anticycloniques, Bellaouchi"
1,traitement,"’, chez, être, médecin, patients","médecin, TYSABRI, peut, devra, autres","chez, être, Plaidoyer, dysorthographie, lymphangiomes","contenu, mariage, démocratiques, financiers, coût","a, c', hilarité, anticycloniques, Bellaouchi"
2,maladie,"’, chez, patients, médecin, traitement","étude, cours, clinique, symptômes, •","chez, cas, Plaidoyer, dysorthographie, lymphangiomes","sauver, main, efficacité, terme, effectifs","a, c', hilarité, anticycloniques, Bellaouchi"
3,solution,"mg, perfusion, 1, ’, ml","flacon, contient, 20, mg, 100","chez, cas, Plaidoyer, dysorthographie, lymphangiomes","liberté, règle, désir, géopolitique, défi","a, c', hilarité, anticycloniques, Bellaouchi"
4,jaune,"manifester, fréquences, immuns, propriétés, Dieu","entre, tolcapone, sodium, Etude, rapport","chez, cas, Plaidoyer, dysorthographie, lymphangiomes","I, von, Ariane, Maurice, Airways","a, c', hilarité, anticycloniques, Bellaouchi"
