In [None]:
# !pip install pdfplumber

In [None]:
# !pip install -U spacy

In [None]:
# !python -m spacy download fr_core_news_sm

In [1]:
import re
from pathlib import Path

import joblib
import pdfplumber
import spacy



In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The text of each page followed by a newline character. A page
        for which no text could be extracted contributes only that newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may return None (or "") for a page without a text
            # layer, e.g. a scanned image; `or ""` avoids a
            # `None + str` TypeError that would abort the whole document.
            page_texts.append((page.extract_text() or "") + "\n")
    # Join once at the end instead of growing a string page by page.
    return "".join(page_texts)

In [3]:
def remove_page_headers(text):
    """Delete every "1er janvier 2015" date stamp from ``text``.

    The phrase is matched case-insensitively, with or without the "er"
    suffix, and with any run of whitespace between its words. Only the
    matched phrase is removed; the rest of the line it sat on is kept.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with each matching date stamp replaced by "".
    """
    date_stamp = re.compile(r'\b1(?:er)?\s+janvier\s+2015\b', re.IGNORECASE)
    return date_stamp.sub('', text)

In [4]:
# nlp = spacy.load("fr_core_news_sm")
def split_by_period_newline(text):
    """Split ``text`` into chunks at each period that ends a line.

    A boundary is a "." followed by optional whitespace and at least one
    newline. The boundary itself (period included) is discarded. Periods
    that are followed by more text on the same line do not split.

    Args:
        text: The string to split.

    Returns:
        list[str]: The chunks, stripped of surrounding whitespace, with
        empty chunks left out.
    """
    sentences = []
    for chunk in re.split(r'\.\s*\n+', text):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)
    return sentences


In [9]:
# Source PDFs, each mapped to the label stored next to its sentences.
file_paths = {
    "../data/constitution.pdf": "Constitution",
    "../data/code civil.pdf": "code civil",
    "../data/code de travail.pdf": "code de travail",
    "../data/code penal.pdf": "code penal"
}

# Accumulates (sentence, source label) pairs across all documents.
all_sentences = []

for pdf_path, label in file_paths.items():
    # Extract the text, drop the date stamps, then cut it into sentences.
    chunks = split_by_period_newline(
        remove_page_headers(extract_text_from_pdf(pdf_path))
    )
    all_sentences.extend(
        (chunk.strip(), label) for chunk in chunks if chunk.strip()
    )

print(f"Total sentences: {len(all_sentences)}")

Total sentences: 32482


In [7]:
# Substrings that mark a sentence as page-footer boilerplate.
footer_keywords = [
    "Dernière modification",
    "Document généré",
    "Copyright",
    "Legifrance"
]

def is_footer(sentence):
    """Tell whether ``sentence`` contains any footer keyword.

    The comparison is a case-insensitive substring test against each entry
    of ``footer_keywords``.

    Args:
        sentence: The string to inspect.

    Returns:
        bool: True if at least one keyword occurs in ``sentence``.
    """
    lowered = sentence.lower()
    for keyword in footer_keywords:
        if keyword.lower() in lowered:
            return True
    return False

# Keep only the (sentence, source) pairs whose sentence is not footer text.
all_sentences = [
    (sentence, label)
    for sentence, label in all_sentences
    if not is_footer(sentence)
]

In [8]:
# Split the (sentence, source) pairs into two index-aligned lists.
texts = [pair[0] for pair in all_sentences]
sources = [pair[1] for pair in all_sentences]

In [10]:
nlp = spacy.load("fr_core_news_sm")


def _content_words(doc):
    """Join the alphabetic, non-stop-word tokens of a spaCy ``Doc`` with spaces."""
    return " ".join(
        token.text for token in doc if token.is_alpha and not token.is_stop
    )


def preprocess(text):
    """Lower-case ``text`` and keep only its alphabetic, non-stop-word tokens.

    Intended for a single string (e.g. a search query); the corpus below is
    processed in bulk with the same token filter so both sides of a
    comparison are normalised identically.

    Args:
        text: The string to normalise.

    Returns:
        str: The kept tokens joined by single spaces ("" if none remain).
    """
    return _content_words(nlp(text.lower()))


# Bulk path: nlp.pipe processes the sentences in batches instead of one call
# per sentence. Only token.text / is_alpha / is_stop are read, which come from
# the tokenizer and vocabulary, so the parser and NER are skipped here.
preprocessed_sentences = [
    _content_words(doc)
    for doc in nlp.pipe(
        (sentence.lower() for sentence in texts),
        disable=["parser", "ner"],
    )
]


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model with default settings, fitted on the preprocessed corpus.
vectorizer = TfidfVectorizer()
# X holds the document-term matrix returned by the fit, one row per sentence.
X = vectorizer.fit_transform(raw_documents=preprocessed_sentences)

In [None]:
# Persist everything needed to query the index later without re-running the
# PDF extraction and spaCy preprocessing.
# NOTE(review): artifacts are split between ../models and ../data; the paths
# are kept as-is because whatever loads them presumably expects these names.
artifacts = {
    "../models/preprocessed_sentences.pkl": preprocessed_sentences,
    "../models/tfidf_vectorizer.pkl": vectorizer,
    "../data/tfidf_matrix.pkl": X,
    "../data/texts.pkl": texts,
    "../data/sources.pkl": sources,
}

for artifact_path, artifact in artifacts.items():
    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail with FileNotFoundError.
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(artifact, artifact_path)

['../data/sources.pkl']

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def search(query, top_k=3):
#     query_clean = preprocess(query)
#     query_vec = vectorizer.transform([query_clean])
#     scores = cosine_similarity(query_vec, X)[0]
#     top_indices = np.argsort(scores)[::-1][:top_k]

#     results = []
#     for i in top_indices:
#         results.append({
#             "score": scores[i],
#             "text": texts[i],
#             "source": sources[i]
#         })
#     return results


In [None]:
# results = search("Que est l'hymne national de la france")

# for res in results:
#     print(f"[{res['score']:.4f}] (source: {res['source']})")
#     print(f"→ {res['text']}\n")
