In [1]:
import matplotlib.pyplot as plt

from tqdm import tqdm

from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

#from utils_func import corpus_processing, matrix_creation, clustering, retriever_model

import os

import spacy
from tqdm import tqdm
import pandas as pd

from multiprocessing import Pool, cpu_count
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

  from tqdm.autonotebook import tqdm


In [2]:
import spacy
from tqdm import tqdm
import pandas as pd

from multiprocessing import Pool, cpu_count
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
clean_tokens = lambda tokens : ' '.join([token.lemma_.lower() for token in tokens if token.lemma_.lower() not in stopwords and not token.is_punct])

def pre_process(elem_to_preprocess: tuple[int, dict[str,str]]) -> tuple[int, str]:
  """
  Preprocesses the text data in the corpus\n
  :param elem_to_preprocess: Tuple containing the key and the value of the element to preprocess\n
  :return: Tuple containing the key and the preprocessed text data\n
  """

  key, val = elem_to_preprocess
  return key, f"{clean_tokens(nlp(val['title'].lower()))} {clean_tokens(nlp(val['text'].lower()))}" # Cleaning the text document

# Helper function to process a single key-value pair
def process_item(item):
    key, value = item
    return key, pre_process((key, value))[1]

def preprocess_corpus_dict(corpus: dict[int, dict[str, str]]) -> dict[int, str]:
    """
    Preprocesses the text data in the corpus in parallel\n
    :param corpus: The corpus to preprocess
    :return: The preprocessed corpus
    """
    # Convert the corpus to a list of items for parallel processing
    items = list(corpus.items())

    # Use a multiprocessing pool for parallel execution
    with Pool(cpu_count()) as pool:
        results = list(tqdm(pool.imap(process_item, items), total=len(items)))

    # Combine results into a dictionary
    cleaned_corpus = {key: value for key, value in results}

    return cleaned_corpus

""" Non parallel version

def preprocess_corpus_dict(corpus: dict[int, dict[str,str]]) -> dict[int, str]:
    '''
    Preprocesses the text data in the corpus\n
    :param corpus: The corpus to preprocess\n
    :return: The preprocessed corpus\n
    '''
    cleaned_corpus = {}
    for key in tqdm(corpus.keys()):
        cleaned_corpus[key] = pre_process((key, corpus[key]))[1]
    return cleaned_corpus
"""

def get_unique_words(corpus:dict[int, str]) -> set:
    """
    Function to find the unique words in a corpus
    :param corpus: dict[int, str] - a dictionary with the key being the document id and the value being the document text
    :return: set - a set of unique words in the corpus
    """
    unique_words = set()
    for doc_id in tqdm(corpus, desc="Getting unique words"):
        text = corpus[doc_id]
        text = text.split()
        for word in text:
            unique_words.add(word)
    return unique_words

def save_processed_corpus(corpus, path_to_save):
    """
    Saves the preprocessed corpus to a csv file\n
    :param corpus: The preprocessed corpus\n
    :param path_to_save: The path to save the preprocessed corpus\n
    """

    df = pd.DataFrame.from_dict(corpus, orient='index')
    df.reset_index(inplace=True)
    df.columns = ["doc_id", "text"]
    df.to_csv(path_to_save, index=False)

In [5]:
from utils_func import corpus_processing

In [6]:
# Example corpus and queries (replace with your actual data)
#dataset = "scidocs"
dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 24760.70it/s]


In [7]:
cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)

Prétraitement du corpus: 100%|██████████| 3633/3633 [00:26<00:00, 138.83it/s]


In [8]:
import tempfile

with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
    for keys in tqdm(cleaned_corpus.keys()):
        temp_file.write(f"{cleaned_corpus[keys]}\n")
    temp_file_path = temp_file.name

100%|██████████| 3633/3633 [00:00<00:00, 202526.70it/s]


In [9]:
import tempfile
import fasttext
import re
import numpy as np

'''
with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
    temp_file.write(text)
    temp_file_path = temp_file.name
'''
# Entraîner le modèle FastText avec le fichier temporaire
model = fasttext.train_unsupervised(temp_file_path, model='skipgram')
# Liste des mots à visualiser (tu peux la personnaliser)

#words = re.findall(r'\b\w+\b', text.lower())
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')
'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

model.save_model(f'models/fasttext_{dataset}')

Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 47901.98it/s]


In [10]:
len(unique_words)

29043

# Test

In [None]:
import tempfile
import fasttext
import re
import numpy as np
from tqdm import tqdm


print('load pretrained model')
pretrained_model = fasttext.load_model("cc.en.100.bin")
print('pretrained model loaded')

with open("pretrained.vec", "w") as f:
    f.write(f"{len(pretrained_model.words)} 100\n")  # First line: vocab size, dimension
    for word in tqdm(pretrained_model.words, 'writing vec files'):
        vec = " ".join(map(str, pretrained_model.get_word_vector(word)))
        f.write(f"{word} {vec}\n")

print('training finetuned model')
# Liste des mots à visualiser (tu peux la personnaliser)
model = fasttext.train_unsupervised(temp_file_path, model='skipgram', pretrainedVectors='pretrained.vec')
# Liste des mots à visualiser (tu peux la personnaliser)
print("end of training finetuned model")

#words = re.findall(r'\b\w+\b', text.lower())
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')
'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

model.save_model(f'models/fasttext_{dataset}')

Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 55437.78it/s]


TypeError: '_FastText' object does not support item assignment

In [9]:
import tempfile
import fasttext
import re
import numpy as np
from tqdm import tqdm


model = fasttext.load_model('cc.en.100.bin')
unique_words = list(get_unique_words(cleaned_corpus))
#unique_words = list(set(words))

# Obtenir les vecteurs pour les mots choisis
word_vectors = np.array([model.get_word_vector(word) for word in unique_words])

embeddings = pd.DataFrame(word_vectors, index=unique_words)
embeddings.to_csv(f"word_vectors/word_vectors_{dataset}.csv", sep = ' ')

'''
# Path to the output file
output_file = f"word_vectors_{dataset}.txt"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop over each word and its corresponding vector
    for word, vector in zip(unique_words, word_vectors):
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and its vector, followed by a newline
        f.write(f"{word} {vector_str}\n")
'''

Getting unique words: 100%|██████████| 57638/57638 [00:01<00:00, 55556.01it/s]


'\n# Path to the output file\noutput_file = f"word_vectors_{dataset}.txt"\n\n# Open the file in write mode\nwith open(output_file, "w") as f:\n    # Loop over each word and its corresponding vector\n    for word, vector in zip(unique_words, word_vectors):\n        # Convert the vector to a space-separated string\n        vector_str = \' \'.join(map(str, vector))\n        # Write the word and its vector, followed by a newline\n        f.write(f"{word} {vector_str}\n")\n'