In [None]:
# !pip install pdfplumber
# !pip install -U spacy
# !python -m spacy download fr_core_news_sm

In [1]:
import spacy
import joblib

In [2]:
# Load the precomputed retrieval artifacts from disk.
# SECURITY: joblib.load unpickles its input, which can execute arbitrary code;
# only load files produced by a trusted pipeline.
# NOTE(review): `all_sentences` is not referenced in the cells shown here.
all_sentences = joblib.load("../models/all_sentences.pkl")
# `texts` and `sources` are indexed with the same positions as the rows of `X`
# by `search`; presumably they were saved in matching order -- TODO confirm.
texts = joblib.load("../data/texts.pkl")
sources = joblib.load("../data/sources.pkl")
# Presumably the fitted TF-IDF vectorizer and the matrix it produced for the
# corpus (inferred from the file names) -- TODO confirm they come from one run.
vectorizer = joblib.load("../models/tfidf_vectorizer.pkl")
X = joblib.load("../data/tfidf_matrix.pkl")

In [4]:
# Load spaCy's small French pipeline once at module level; `preprocess` reuses it.
# The model must be installed beforehand (see the commented-out
# `python -m spacy download fr_core_news_sm` line in the first cell).
nlp = spacy.load("fr_core_news_sm")

def preprocess(text):
    """Normalize ``text`` into a space-separated string of content words.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Only tokens flagged as purely alphabetic and not flagged as stop
    words are kept, in their original order.

    Args:
        text: Raw input string.

    Returns:
        The surviving token texts joined by single spaces (``""`` if none).
    """
    kept = []
    for tok in nlp(text.lower()):
        # Skip punctuation, numbers and mixed tokens, as well as stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        kept.append(tok.text)
    return " ".join(kept)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, top_k=3):
    """Return the corpus passages most similar to ``query``.

    The query is cleaned with ``preprocess``, projected with the module-level
    ``vectorizer`` and compared by cosine similarity against every row of the
    module-level matrix ``X``. Row ``i`` of ``X`` is reported with ``texts[i]``
    and ``sources[i]``.

    Args:
        query: Free-text question.
        top_k: Maximum number of results. ``None`` means no limit; values
            ``<= 0`` yield an empty list.

    Returns:
        A list of dicts with keys ``"score"``, ``"text"`` and ``"source"``,
        ordered by decreasing score (ties keep corpus order). Passages whose
        similarity is not strictly positive are omitted, so the list may hold
        fewer than ``top_k`` entries and is empty when nothing matches.
    """
    if top_k is not None and top_k <= 0:
        # A negative bound would otherwise be read as "all but the last k"
        # by the slice below and return almost the whole corpus.
        return []

    query_clean = preprocess(query)
    query_vec = vectorizer.transform([query_clean])
    scores = cosine_similarity(query_vec, X)[0]

    # Stable sort on the negated scores: descending order, and equal scores
    # stay in corpus order so repeated runs give the same ranking.
    ranked = np.argsort(-scores, kind="stable")[:top_k]

    # A zero score means the passage shares no weighted term with the query
    # (e.g. the query has no in-vocabulary word); reporting it would present
    # an arbitrary passage as a hit.
    return [
        {"score": scores[i], "text": texts[i], "source": sources[i]}
        for i in ranked
        if scores[i] > 0
    ]


In [6]:
# Demo query: retrieve the default top-3 passages for a question about the
# French national anthem.
results = search("Que est l'hymne national de la france")

# For each hit, print the similarity score (4 decimals) and its source on one
# line, then the passage text followed by a blank line.
for res in results:
    print(f"[{res['score']:.4f}] (source: {res['source']})")
    print(f"→ {res['text']}\n")


[0.6749] (source: Constitution)
→ L'hymne national est « La Marseillaise »

[0.3562] (source: code penal)
→ Article 433-5-1
Le fait, au cours d'une manifestation organisée ou réglementée par les autorités publiques, d'outrager
publiquement l'hymne national ou le drapeau tricolore est puni de 7 500 euros d'amende

[0.2478] (source: code de travail)
→ Paragraphe 2 : Service national

