In [3]:
pip install PyMuPDF pdfminer.six typesense transformers 

Collecting pdfminer.six
  Using cached pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting typesense
  Using cached typesense-0.21.0-py3-none-any.whl.metadata (1.9 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Using cached cryptography-43.0.0-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Using cached pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
Using cached typesense-0.21.0-py3-none-any.whl (21 kB)
Using cached cryptography-43.0.0-cp39-abi3-win_amd64.whl (3.1 MB)
Installing collected packages: typesense, cryptography, pdfminer.six
Successfully installed cryptography-43.0.0 pdfminer.six-20240706 typesense-0.21.0
Note: you may need to restart the kernel to use updated packages.


In [19]:
import fitz  # PyMuPDF
import os
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np

# Download French stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trabe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
def extract_text_from_multiple_pdfs(folder_path):
    all_text_data = {}

    # Get all PDF files in the folder
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    for pdf_path in pdf_files:
        pdf_document = fitz.open(pdf_path)
        pdf_name = os.path.basename(pdf_path)
        text_data = []
        
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            text = page.get_text("text")
            text_data.append(text)
        
        all_text_data[pdf_name] = text_data
    
    return all_text_data

# Example usage
folder_path = "french"  # Path to your folder containing the PDFs
extracted_texts = extract_text_from_multiple_pdfs(folder_path)


In [15]:
def extract_keywords_tfidf(documents, top_n=5):
    # Initialize the French stopwords
    french_stopwords = list(set(stopwords.words('french')))  # Convert to list

    # Use TF-IDF Vectorizer to extract keywords
    tfidf_vectorizer = TfidfVectorizer(max_df=0.85, stop_words=french_stopwords, max_features=10000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Extract keywords for each document
    keywords_list = []
    for i in range(tfidf_matrix.shape[0]):
        tfidf_scores = tfidf_matrix[i].T.todense()
        tfidf_scores = np.array(tfidf_scores).flatten()
        top_keywords_indices = tfidf_scores.argsort()[-top_n:]
        top_keywords = [feature_names[j] for j in top_keywords_indices]
        keywords_list.append(top_keywords)
    
    return keywords_list

In [21]:
# Prepare the documents (all pages of all PDFs)
documents = []
for pdf_name, pages in extracted_texts.items():
    documents.extend(pages)

# Extract keywords using TF-IDF
keywords_list = extract_keywords_tfidf(documents)

# Display extracted keywords for each page
for i, keywords in enumerate(keywords_list):
    print(f"Document {i+1} Keywords: {', '.join(keywords)}")


Document 1 Keywords: déstructurer, œuvre, santé, guide, pratique
Document 2 Keywords: ministre, 2017f, hp5, cat, 660
Document 3 Keywords: durant, santé, guide, expérience, renseignements
Document 4 Keywords: favoriserez, additionnelles, croissance, collation, prénatale
Document 5 Keywords: légumes, privilégiez, matières, substituts, grasses
Document 6 Keywords: prénatale, poids, imc, livres, kg
Document 7 Keywords: comme, non, site, gc, pasteurisés
Document 8 Keywords: vertébrale, colonne, crâne, acide, folique
Document 9 Keywords: tube, neural, acide, folique, atn
Document 10 Keywords: santé, décelées, acide, folique, atn
Document 11 Keywords: bon, durant, ensemble, etcaf, alcool
Document 12 Keywords: bébé, sait, quantité, développement, alcool
Document 13 Keywords: biologique, boire, durant, alcool, etcaf
Document 14 Keywords: aide, durant, améliore, physique, activité
Document 15 Keywords: commencez, activités, augmentez, physique, activité
Document 16 Keywords: active, charges, hal

In [23]:
# Delete the collection if it already exists
try:
    client.collections['pdfs'].delete()
    print("Existing collection 'pdfs' deleted.")
except Exception as e:
    print(f"Could not delete collection: {e}")

# Define schema for Typesense collection
schema = {
    'name': 'pdfs',
    'fields': [
        {'name': 'pdf_name', 'type': 'string'},
        {'name': 'text', 'type': 'string'},
        {'name': 'keywords', 'type': 'string'},
        {'name': 'page_number', 'type': 'int32'},
        {'name': 'paragraph_number', 'type': 'int32'}
    ]
}

# Create the collection again
try:
    client.collections.create(schema)
    print("New collection 'pdfs' created.")
except Exception as e:
    print(f"Failed to create collection: {e}")


Existing collection 'pdfs' deleted.
New collection 'pdfs' created.


In [25]:
def index_multiple_pdfs_data_tfidf(extracted_texts, keywords_list):
    doc_idx = 0
    for pdf_name, pages in extracted_texts.items():
        for page_num, text in enumerate(pages):
            keywords = ', '.join(keywords_list[doc_idx])
            document = {
                'pdf_name': pdf_name,
                'text': text,
                'keywords': keywords,  # Ensure keywords are included
                'page_number': page_num,
                'paragraph_number': 0  # Assume 0 as we are not splitting paragraphs further here
            }
            client.collections['pdfs'].documents.create(document)
            doc_idx += 1

# Index the text data from all PDFs
index_multiple_pdfs_data_tfidf(extracted_texts, keywords_list)

print("Indexing completed.")


Indexing completed.


In [26]:
# Test by querying the indexed data
print("Testing by querying:")
try:
    results = client.collections['pdfs'].documents.search({
        'q': 'nutrition prénatale',
        'query_by': 'text,keywords'
    })
    for hit in results['hits']:
        print(f"PDF: {hit['document']['pdf_name']}, Page: {hit['document']['page_number']}")
        print("Keywords:", hit['document']['keywords'])
        print(hit['document']['text'][:200], '...')  # Display first 200 characters
        print("-" * 80)
except Exception as e:
    print(f"Query failed: {e}")

Testing by querying:
PDF: 64-03-16-1758-Sensible Guide to Healthy Pregnancy-FR-Web-final-v1.pdf, Page: 39
Keywords: prénatale, www, gc, ca, pcnp
Gouvernement du Canada  |  Le guide pratique d’une grossesse en santé 
38
Ressources
RESSOURCES
Grossesse en santé 
Canada.ca/sante
Activité physique 
www.santepublique.gc.ca/guideap
Guide des parents ...
--------------------------------------------------------------------------------
PDF: 64-03-16-1758-Sensible Guide to Healthy Pregnancy-FR-Web-final-v1.pdf, Page: 5
Keywords: prénatale, poids, imc, livres, kg
4
La nutrition prénatale
Gouvernement du Canada  |  Le guide pratique d’une grossesse en santé 
Consultez le site Internet de Santé Canada pour savoir 
comment choisir des poissons qui contiennent peu ...
--------------------------------------------------------------------------------
PDF: 64-03-16-1758-Sensible Guide to Healthy Pregnancy-FR-Web-final-v1.pdf, Page: 3
Keywords: favoriserez, additionnelles, croissance, collation, prénatale

In [33]:
import os
os.environ["TORCH_DYNAMO_DISABLE"] = "1"


In [35]:
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Test embedding extraction
try:
    text = "Votre texte en français ici."
    embedding = model.encode(text)
    print("Embedding shape:", embedding.shape)
except Exception as e:
    print("Error during embedding extraction:", e)

Embedding shape: (384,)


In [37]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import torch

# Load a French-compatible model
# Example using a Mistral model (adjust model path as needed)
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

# If using a LLaMA model or similar, you might load it like this:
# model_name = "path/to/llama/french/model"
# model = AutoModel.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_embedding(text):
    # For Sentence-Transformers
    return model.encode(text, convert_to_tensor=True)

# Example usage
embedding = get_embedding("Votre texte en français ici.")
print(embedding.shape)  # Should return the shape of the embedding tensor


torch.Size([384])


In [47]:
def semantic_search(query, typesense_results):
    # Get the embedding of the query
    query_embedding = get_embedding(query)
    
    # Prepare the list to hold the results with their similarity scores
    search_results = []

    for hit in typesense_results['hits']:
        # Extract text from the Typesense result
        text = hit['document']['text']
        pdf_name = hit['document']['pdf_name']
        page_number = hit['document']['page_number']
        
        # Get the embedding of the document text
        text_embedding = get_embedding(text)
        
        # Compute the cosine similarity
        similarity = util.pytorch_cos_sim(query_embedding, text_embedding).item()
        
        # Store the result with the similarity score
        search_results.append((similarity, pdf_name, page_number, text))

    # Sort the results by similarity score in descending order
    search_results = sorted(search_results, key=lambda x: x[0], reverse=True)

    return search_results

# Perform Typesense search
print("Performing Typesense search...")
typesense_results = client.collections['pdfs'].documents.search({
    'q': 'nutrition prénatale',
    'query_by': 'text,keywords'
})

# Now perform semantic search using the Typesense results
query = "nutrition prénatale"
semantic_results = semantic_search(query, typesense_results)

# Display the top semantic search results
print(f"\nTop Semantic Search Results for '{query}':")
for similarity, pdf_name, page_number, text in semantic_results[:5]:  # Show top 5 results
    print(f"PDF: {pdf_name}, Page: {page_number + 1}, Similarity: {similarity:.4f}")
    print(text[:500], '...')  # Display first 200 characters of the text
    print("-" * 80)


Performing Typesense search...

Top Semantic Search Results for 'nutrition prénatale':
PDF: PNNS_grossesse.pdf, Page: 37, Similarity: 0.8558
37
Le guide nutrition 
de la grossesse
 ...
--------------------------------------------------------------------------------
PDF: 64-03-16-1758-Sensible Guide to Healthy Pregnancy-FR-Web-final-v1.pdf, Page: 4, Similarity: 0.7323
2
La nutrition prénatale
Gouvernement du Canada  |  Le guide pratique d’une grossesse en santé 
La nutrition prénatale
L’alimentation joue un rôle très important dans une grossesse en santé. Pour vous assurer d’obtenir 
les vitamines, minéraux et nutriments dont vous et votre enfant à naître avez besoin, vous devez 
consommer des aliments de différentes sources. En mangeant sainement, vous vous sentirez mieux, 
vous aurez plus d’énergie et vous favoriserez une prise de poids saine. Vous contribu ...
--------------------------------------------------------------------------------
PDF: 64-03-16-1758-Sensible Guide to Healthy