In [1]:
import matplotlib.pyplot as plt

from tqdm import tqdm

from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

#from utils_func import corpus_processing, matrix_creation, clustering, retriever_model

import os

import spacy
from tqdm import tqdm
import pandas as pd

from multiprocessing import Pool, cpu_count
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

  from tqdm.autonotebook import tqdm


In [2]:
import spacy
from tqdm import tqdm
import pandas as pd

from multiprocessing import Pool, cpu_count
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
clean_tokens = lambda tokens : ' '.join([token.lemma_.lower() for token in tokens if token.lemma_.lower() not in stopwords and not token.is_punct])

def pre_process(elem_to_preprocess: tuple[int, dict[str,str]]) -> tuple[int, str]:
  """
  Preprocesses the text data in the corpus\n
  :param elem_to_preprocess: Tuple containing the key and the value of the element to preprocess\n
  :return: Tuple containing the key and the preprocessed text data\n
  """

  key, val = elem_to_preprocess
  return key, f"{clean_tokens(nlp(val['title'].lower()))} {clean_tokens(nlp(val['text'].lower()))}" # Cleaning the text document

# Helper function to process a single key-value pair
def process_item(item):
    key, value = item
    return key, pre_process((key, value))[1]

def preprocess_corpus_dict(corpus: dict[int, dict[str, str]]) -> dict[int, str]:
    """
    Preprocesses the text data in the corpus in parallel\n
    :param corpus: The corpus to preprocess
    :return: The preprocessed corpus
    """
    # Convert the corpus to a list of items for parallel processing
    items = list(corpus.items())

    # Use a multiprocessing pool for parallel execution
    with Pool(cpu_count()) as pool:
        results = list(tqdm(pool.imap(process_item, items), total=len(items)))

    # Combine results into a dictionary
    cleaned_corpus = {key: value for key, value in results}

    return cleaned_corpus

""" Non parallel version

def preprocess_corpus_dict(corpus: dict[int, dict[str,str]]) -> dict[int, str]:
    '''
    Preprocesses the text data in the corpus\n
    :param corpus: The corpus to preprocess\n
    :return: The preprocessed corpus\n
    '''
    cleaned_corpus = {}
    for key in tqdm(corpus.keys()):
        cleaned_corpus[key] = pre_process((key, corpus[key]))[1]
    return cleaned_corpus
"""

def get_unique_words(corpus:dict[int, str]) -> set:
    """
    Function to find the unique words in a corpus
    :param corpus: dict[int, str] - a dictionary with the key being the document id and the value being the document text
    :return: set - a set of unique words in the corpus
    """
    unique_words = set()
    for doc_id in tqdm(corpus, desc="Getting unique words"):
        text = corpus[doc_id]
        text = text.split()
        for word in text:
            unique_words.add(word)
    return unique_words

def save_processed_corpus(corpus, path_to_save):
    """
    Saves the preprocessed corpus to a csv file\n
    :param corpus: The preprocessed corpus\n
    :param path_to_save: The path to save the preprocessed corpus\n
    """

    df = pd.DataFrame.from_dict(corpus, orient='index')
    df.reset_index(inplace=True)
    df.columns = ["doc_id", "text"]
    df.to_csv(path_to_save, index=False)



In [3]:
from utils_func import corpus_processing

In [4]:
# Example corpus and queries (replace with your actual data)
#dataset = "scidocs"
dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 106438.95it/s]


In [5]:
cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)

Prétraitement du corpus: 100%|██████████| 3633/3633 [00:26<00:00, 134.58it/s]


In [6]:
import tempfile

with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
    for keys in tqdm(cleaned_corpus.keys()):
        temp_file.write(f"{cleaned_corpus[keys]}\n")
    temp_file_path = temp_file.name

100%|██████████| 3633/3633 [00:00<00:00, 146485.94it/s]


In [7]:
import tempfile
import fasttext
import re
import numpy as np

dim = 100
'''
with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
    temp_file.write(text)
    temp_file_path = temp_file.name
'''
# Entraîner le modèle FastText avec le fichier temporaire
model = fasttext.train_unsupervised(temp_file_path, model='skipgram', epoch = 5, dim=dim)
# Liste des mots à visualiser (tu peux la personnaliser)
#if dim > 100:
#    fasttext.util.reduce_model(model, 100)

#words = re.findall(r'\b\w+\b', text.lower())
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')
'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

model.save_model(f'models/fasttext_{dataset}')

Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 28395.40it/s]


In [8]:
words = set(model.words)
words_emb = set(list(embeddings.index))
len(words), len(words_emb), len(words.intersection(words_emb))

(8615, 28549, 8614)

In [27]:
model.get_word_vector("study")

array([-1.24736972e-01,  1.55483544e-01,  1.28757864e-01,  1.05919413e-01,
       -1.72636896e-01,  2.19534859e-01,  3.77277106e-01, -2.17107505e-01,
        9.40025225e-02, -3.75892818e-02,  3.26620609e-01,  2.29673848e-01,
       -5.97025692e-01,  3.68287772e-01, -1.34355962e-01,  1.30748555e-01,
        3.06492895e-01, -3.39102894e-01,  3.43339235e-01, -1.72479391e-01,
       -4.86842215e-01, -4.16942954e-01,  2.15740558e-02,  6.34661391e-02,
       -7.41366893e-02,  1.33865386e-01,  1.39842197e-01,  1.78903669e-01,
       -2.07893655e-01, -2.45207608e-01,  1.94883406e-01, -8.47381949e-02,
       -8.78266525e-03, -3.35278273e-01,  2.33303279e-01,  2.82787442e-01,
       -1.74766675e-01, -4.27321941e-02, -1.31116837e-01, -1.05515592e-01,
        2.93980420e-01,  2.79508412e-01,  1.21777266e-01, -2.61485904e-01,
       -2.34878153e-01,  1.71515167e-01,  3.13903749e-01, -1.24379329e-01,
        3.41024667e-01, -1.10647030e-01, -1.10391782e-04, -2.17686027e-01,
       -3.54332745e-01, -

In [32]:
vect = [model.get_word_vector(i) for i in model.words]

In [34]:
pd.DataFrame(vect, index = model.words).to_csv(f"word_vectors/word_vectors_{dataset}_test.csv", sep = ' ')

In [9]:
unique_words[0]

'retinal'

In [10]:
model.get_nearest_neighbors(unique_words[0])

[(0.8931265473365784, 'retina'),
 (0.7388615608215332, 'retinopathy'),
 (0.7326111793518066, 'spinal'),
 (0.7128815650939941, 'atrophy'),
 (0.7120115160942078, 'retinitis'),
 (0.7038640379905701, 'svd'),
 (0.7029131054878235, 'macula'),
 (0.6998583674430847, 'vein'),
 (0.6985985040664673, 'iris'),
 (0.6962120532989502, 'reticulum')]

In [1]:
model.get_nearest_neighbors('8.3', k=50)

NameError: name 'model' is not defined

In [6]:
import bm25s

# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "\\cos(e^{i\\pi}) = -1"
]

# Tokenize the corpus and index it
corpus_tokens = bm25s.tokenize(corpus)
retriever = bm25s.BM25(corpus=corpus)
retriever.index(corpus_tokens)

                                                           

In [7]:
corpus_tokens

Tokenized(ids=[[0, 1, 2, 3], [4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16]], vocab={'cat': 0, 'feline': 1, 'likes': 2, 'purr': 3, 'dog': 4, 'human': 5, 'best': 6, 'friend': 7, 'loves': 8, 'play': 9, 'bird': 10, 'beautiful': 11, 'animal': 12, 'can': 13, 'fly': 14, 'cos': 15, 'pi': 16, '': 17})

In [3]:

# You can now search the corpus with a query
query = "does the fish purr like a cat?"
query_tokens = bm25s.tokenize(query)
docs, scores = retriever.retrieve(query_tokens, k=2)
print(f"Best result (score: {scores[0, 0]:.2f}): {docs[0, 0]}")

                                                     

Best result (score: 0.86): a cat is a feline and likes to purr




# Evaluate fasttext

In [None]:
import fasttext
import numpy as np
from sklearn.metrics import accuracy_score

def train_and_evaluate(train_file, valid_file, epoch_values=[1, 5, 10, 20, 30, 50]):
    best_epoch = None
    best_acc = 0.0
    
    for epoch in epoch_values:
        print(f"Training with epoch = {epoch}")
        model = fasttext.train_supervised(train_file, epoch=epoch, lr=0.1, wordNgrams=2)
        
        # Evaluate on validation set
        true_labels = []
        pred_labels = []
        
        with open(valid_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    parts = line.strip().split(" ")
                    label = parts[0]  # FastText assumes labels start with '__label__'
                    text = " ".join(parts[1:])
                    pred = model.predict(text)[0][0]  # Get top prediction
                    
                    true_labels.append(label)
                    pred_labels.append(pred)
        
        acc = accuracy_score(true_labels, pred_labels)
        print(f"Epoch {epoch} - Validation Accuracy: {acc:.4f}")
        
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch
    
    print(f"Best epoch: {best_epoch} with accuracy: {best_acc:.4f}")
    return best_epoch

# Test

In [None]:
import tempfile
import fasttext
import re
import numpy as np
from tqdm import tqdm


print('load pretrained model')
pretrained_model = fasttext.load_model("cc.en.100.bin")
print('pretrained model loaded')

with open("pretrained.vec", "w") as f:
    f.write(f"{len(pretrained_model.words)} 100\n")  # First line: vocab size, dimension
    for word in tqdm(pretrained_model.words, 'writing vec files'):
        vec = " ".join(map(str, pretrained_model.get_word_vector(word)))
        f.write(f"{word} {vec}\n")

print('training finetuned model')
# Liste des mots à visualiser (tu peux la personnaliser)
model = fasttext.train_unsupervised(temp_file_path, model='skipgram', pretrainedVectors='pretrained.vec')
# Liste des mots à visualiser (tu peux la personnaliser)
print("end of training finetuned model")

#words = re.findall(r'\b\w+\b', text.lower())
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')
'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

model.save_model(f'models/fasttext_{dataset}')

Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 55437.78it/s]


TypeError: '_FastText' object does not support item assignment

In [9]:
import tempfile
import fasttext
import re
import numpy as np
from tqdm import tqdm


model = fasttext.load_model('cc.en.100.bin')
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')

'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

Getting unique words: 100%|██████████| 57638/57638 [00:01<00:00, 55556.01it/s]


'\n# Path to the output file\noutput_file = f"word_vectors_{dataset}.txt"\n\n# Open the file in write mode\nwith open(output_file, "w") as f:\n    # Loop over each word and its corresponding vector\n    for word, vector in zip(unique_words, word_vectors):\n        # Convert the vector to a space-separated string\n        vector_str = \' \'.join(map(str, vector))\n        # Write the word and its vector, followed by a newline\n        f.write(f"{word} {vector_str}\n")\n'