In [1]:
import re
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## 1. Pre-processing ##

In [None]:
# Fetch each NLTK resource used by the tokenizer, stop-word list and lemmatizer below.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

def normalize_text(text):
    """
    Normalize raw text: lowercase it, strip every character that is not an
    ASCII letter or whitespace, drop English stop words and lemmatize the rest.

    Args:
        text (str): Raw text.

    Returns:
        str: Normalized text (remaining lemmas joined by single spaces).
    """
    # Digits and punctuation (apostrophes included) are removed before tokenizing.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    raw_tokens = word_tokenize(letters_only)

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = list(filter(lambda token: token not in english_stop_words, raw_tokens))

    lemmatizer = nltk.WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, kept_tokens)

    # Rebuild the normalized text from the lemmas
    return ' '.join(lemmas)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Loïc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Loïc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Loïc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Loïc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [5]:
official_transcript_path = "data/biden-sotu-2023-planned-official.txt"
autogenerated_transcript_path = "data/biden-sotu-2023-autogenerated-transcript.txt"

# Read both raw transcripts (official first, then auto-generated).
raw_transcripts = []
for transcript_path in (official_transcript_path, autogenerated_transcript_path):
    with open(transcript_path, 'r', encoding='utf-8') as file:
        raw_transcripts.append(file.read())
official_transcript, autogenerated_transcript = raw_transcripts

# Normalize each transcript with the pre-processing function.
normalized_official_transcript = normalize_text(official_transcript)
normalized_autogenerated_transcript = normalize_text(autogenerated_transcript)

# Persist the normalized versions next to the raw data.
normalized_outputs = (
    ("data/normalized_official_transcript.txt", normalized_official_transcript),
    ("data/normalized_autogenerated_transcript.txt", normalized_autogenerated_transcript),
)
for output_path, normalized_transcript in normalized_outputs:
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(normalized_transcript)

print("Normalization over. Normalized files have been saved.")

Normalization over. Normalized files have been saved.


## 2. Data indexing ##

In [6]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import re

def split_into_passages(text, passage_size=100):
    """
    Split a text into consecutive passages of a fixed number of words.

    Words are obtained with ``str.split()``, so any run of whitespace acts as
    a separator. The last passage may hold fewer than ``passage_size`` words.

    Args:
        text (str): Raw text.
        passage_size (int): Number of words per passage; must be positive.

    Returns:
        list: Passages, each a string of words joined by single spaces.

    Raises:
        ValueError: If ``passage_size`` is zero or negative.
    """
    # A negative size would make range() empty and silently discard the whole
    # text; zero would fail inside range() with an unrelated-looking message.
    if passage_size <= 0:
        raise ValueError(
            f"passage_size must be a positive integer, got {passage_size!r}"
        )

    words = text.split()
    return [
        ' '.join(words[start:start + passage_size])
        for start in range(0, len(words), passage_size)
    ]

# Step 2: encode the passages
def encode_passages(passages, model_name="all-MiniLM-L6-v2"):
    """
    Turn passages into embedding vectors with a SentenceTransformers model.

    Args:
        passages (list): Passages to encode.
        model_name (str): Name of the SentenceTransformers model to load.

    Returns:
        np.ndarray: Array built from the model's embeddings of the passages.
    """
    # The model is loaded on every call, then run once over all passages.
    encoder = SentenceTransformer(model_name)
    return np.array(encoder.encode(passages, show_progress_bar=True))

# Step 3: build the index with FAISS
def build_faiss_index(embeddings):
    """
    Create a flat L2 (Euclidean distance) FAISS index from embedding vectors.

    Args:
        embeddings (np.ndarray): 2-D matrix with one embedding vector per row.

    Returns:
        faiss.IndexFlatL2: Index containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not a 2-D matrix (for example when an
            empty list of passages was encoded).
    """
    # FAISS operates on C-contiguous float32 matrices; coercing here is a
    # no-op for input already in that layout and protects against float64 or
    # non-contiguous arrays.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D matrix, got an array of shape {vectors.shape}"
        )

    dimension = vectors.shape[1]  # Vector dimensionality
    index = faiss.IndexFlatL2(dimension)  # Exact search with L2 distance
    index.add(vectors)  # Add the vectors to the index
    return index

# Step 4: full workflow
def main():
    """
    Build and save a FAISS index over the two transcript files.

    Reads the official and auto-generated transcripts, concatenates them,
    collapses whitespace, splits the result into 100-word passages, encodes
    the passages and indexes the embeddings. Writes ``passages.npy`` and
    ``faiss_index.bin`` to the current working directory.

    Raises:
        ValueError: If the transcripts contain no words to index.
    """
    # File paths
    official_transcript_path = "data/biden-sotu-2023-planned-official.txt"
    autogenerated_transcript_path = "data/biden-sotu-2023-autogenerated-transcript.txt"

    # Load and merge the texts; the files are read directly with open(),
    # using the same UTF-8 encoding as the pre-processing cell.
    with open(official_transcript_path, 'r', encoding='utf-8') as file:
        official_text = file.read()
    with open(autogenerated_transcript_path, 'r', encoding='utf-8') as file:
        autogenerated_text = file.read()
    combined_text = official_text + "\n" + autogenerated_text

    # Basic text clean-up (optional): collapse runs of whitespace
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()

    # Split into passages
    passages = split_into_passages(combined_text, passage_size=100)
    if not passages:
        # Fail early with a clear message instead of deep inside encoding/indexing.
        raise ValueError("No text found in the transcripts; nothing to index.")

    # Encode the passages
    embeddings = encode_passages(passages)

    # Build the FAISS index
    index = build_faiss_index(embeddings)

    # Save the passages and the index for later use
    np.save("passages.npy", passages)  # Save the passages
    faiss.write_index(index, "faiss_index.bin")  # Save the index

    print("Index FAISS créé et sauvegardé avec succès !")

# Run the indexing workflow only when this code is executed as the main program.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.