# Post-Extraction NLP Pipeline

This notebook processes extracted PDF text into searchable chunks with embeddings and a simple retriever.

Pipeline
- Load latest extraction CSV from `output/`
- Clean + normalize text
- Deduplicate documents
- Chunk into overlapping segments
- Embed with TF‑IDF (offline) or Sentence-Transformers (optional)
- Fit a cosine Nearest Neighbors index
- Save artifacts and provide a search helper

In [1]:
# Imports (all are commonly available; Sentence-Transformers is optional)
import os, sys, re, json, math, glob, time, hashlib, pathlib, datetime as dt
from pathlib import Path
import numpy as np
import pandas as pd

# Optional deps
try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False
    
try:
    from sentence_transformers import SentenceTransformer
    SBERT_AVAILABLE = True
except ImportError:
    SBERT_AVAILABLE = False
    
# For TF-IDF and neighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

print(f"SpaCy available: {SPACY_AVAILABLE}")
print(f"Sentence-Transformers available: {SBERT_AVAILABLE}")

  from .autonotebook import tqdm as notebook_tqdm


SpaCy available: True
Sentence-Transformers available: True


In [2]:
# Configuration
OUTPUT_DIR = Path('output')
RESULTS_DIR = Path('results')
RESULTS_DIR.mkdir(exist_ok=True)

# Text processing params
MIN_TEXT_LENGTH = 50
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Embedding choice: 'tfidf' or 'sbert'
EMBEDDING_METHOD = 'tfidf'

# If using SBERT
SBERT_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

print(f"Will use {EMBEDDING_METHOD} embeddings")

Will use tfidf embeddings


## Utility Functions

In [3]:
def clean_text(text):
    """Basic text cleaning and normalization."""
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Replace non-breaking spaces
    text = text.replace('\\xa0', ' ')
    
    # Normalize multiple newlines
    text = re.sub(r'\\n\\s*\\n+', '\\n\\n', text)
    
    # Remove excessive whitespace
    text = re.sub(r'[ \\t]+', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def normalize_text(text):
    """More aggressive normalization for deduplication."""
    if not text:
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and extra spaces
    text = re.sub(r'[^\\w\\s]', ' ', text)
    text = re.sub(r'\\s+', ' ', text)
    
    return text.strip()

def simple_sentence_split(text):
    """Simple sentence splitting for chunking."""
    if not text:
        return []
    
    # Split on sentence boundaries
    parts = re.split(r'(?<=[.!?])\\s+(?=[A-ZÉÈÀÂÎÔÙ])', text)
    return [p.strip() for p in parts if p.strip()]

In [4]:
def text_hash(text):
    """Generate a hash for text deduplication."""
    normalized = normalize_text(text)
    return hashlib.md5(normalized.encode()).hexdigest()

def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into overlapping chunks."""
    if not text or len(text) < chunk_size:
        return [text] if text else []
    
    # Try sentence-aware chunking first
    sentences = simple_sentence_split(text)
    
    if not sentences:
        # Fallback to character-based chunking
        chunks = []
        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if chunk.strip():
                chunks.append(chunk.strip())
        return chunks
    
    # Sentence-aware chunking
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    
    return chunks

In [5]:
def latest_csv_in_output(pattern='*with_content*.csv'):
    """Find the most recent CSV file in output directory."""
    paths = sorted(OUTPUT_DIR.glob(pattern))
    if not paths:
        raise FileNotFoundError(f'No CSV found in {OUTPUT_DIR} matching {pattern}')
    return paths[-1]  # Most recent

def save_artifacts(chunks_df, embeddings, index, metadata):
    """Save all pipeline artifacts."""
    timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Save chunks DataFrame
    chunks_path = RESULTS_DIR / f'chunks_{timestamp}.csv'
    chunks_df.to_csv(chunks_path, index=False)
    
    # Save embeddings
    embeddings_path = RESULTS_DIR / f'embeddings_{timestamp}.npy'
    np.save(embeddings_path, embeddings)
    
    # Save index
    import pickle
    index_path = RESULTS_DIR / f'index_{timestamp}.pkl'
    with open(index_path, 'wb') as f:
        pickle.dump(index, f)
    
    # Save metadata
    metadata_path = RESULTS_DIR / f'metadata_{timestamp}.json'
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    
    print(f"Artifacts saved with timestamp {timestamp}")
    return timestamp

## 1. Load and Clean Data

In [6]:
# Load the latest extraction CSV
csv_path = latest_csv_in_output('*with_content*.csv')
print(f"Loading: {csv_path}")

df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} documents")
print(f"Columns: {list(df.columns)}")

# Basic info
if 'extracted_text' in df.columns:
    text_col = 'extracted_text'
elif 'content' in df.columns:
    text_col = 'content'
else:
    text_col = df.columns[-1]  # Assume last column is text

print(f"Using text column: {text_col}")
print(f"Non-empty texts: {df[text_col].notna().sum()}")

Loading: output\pdf_extraction_results_20250830_131547_with_content.csv
Loaded 998 documents
Columns: ['file_name', 'file_path', 'file_size_kb', 'extraction_timestamp', 'content_type', 'page_count', 'extracted_text', 'has_text', 'has_images', 'image_count', 'word_count', 'char_count', 'paragraph_count', 'line_count', 'error']
Using text column: extracted_text
Non-empty texts: 997


In [7]:
# Clean and filter
print("Cleaning text...")
df['clean_text'] = df[text_col].apply(clean_text)

# Filter by minimum length
df['text_length'] = df['clean_text'].str.len()
df_filtered = df[df['text_length'] >= MIN_TEXT_LENGTH].copy()

print(f"After filtering (min {MIN_TEXT_LENGTH} chars): {len(df_filtered)} documents")
print(f"Average text length: {df_filtered['text_length'].mean():.0f} chars")
print(f"Median text length: {df_filtered['text_length'].median():.0f} chars")

Cleaning text...
After filtering (min 50 chars): 996 documents
Average text length: 3514 chars
Median text length: 2427 chars
After filtering (min 50 chars): 996 documents
Average text length: 3514 chars
Median text length: 2427 chars


## 2. Deduplicate

In [8]:
# Add text hashes for deduplication
print("Deduplicating...")
df_filtered['text_hash'] = df_filtered['clean_text'].apply(text_hash)

# Remove duplicates based on text hash
before_dedup = len(df_filtered)
df_dedup = df_filtered.drop_duplicates(subset=['text_hash']).copy()
after_dedup = len(df_dedup)

print(f"Removed {before_dedup - after_dedup} duplicate documents")
print(f"Remaining: {after_dedup} unique documents")

Deduplicating...
Removed 54 duplicate documents
Remaining: 942 unique documents
Removed 54 duplicate documents
Remaining: 942 unique documents


## 3. Create Chunks

In [9]:
# Chunk all documents
print(f"Chunking with size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP}...")

chunks_data = []
for idx, row in df_dedup.iterrows():
    text = row['clean_text']
    chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
    
    for chunk_idx, chunk in enumerate(chunks):
        chunk_data = {
            'doc_id': row.get('filename', f'doc_{idx}'),
            'chunk_id': f"{row.get('filename', f'doc_{idx}')}_{chunk_idx}",
            'chunk_text': chunk,
            'chunk_length': len(chunk),
            'chunk_index': chunk_idx
        }
        
        # Copy relevant metadata
        for col in ['filename', 'page_count', 'extraction_time']:
            if col in row:
                chunk_data[col] = row[col]
        
        chunks_data.append(chunk_data)

chunks_df = pd.DataFrame(chunks_data)
print(f"Created {len(chunks_df)} chunks from {len(df_dedup)} documents")
print(f"Average chunk length: {chunks_df['chunk_length'].mean():.0f} chars")

Chunking with size=512, overlap=50...
Created 942 chunks from 942 documents
Average chunk length: 3505 chars


## 4. Generate Embeddings

In [10]:
# Generate embeddings based on chosen method
print(f"Generating {EMBEDDING_METHOD} embeddings...")

if EMBEDDING_METHOD == 'tfidf':
    # TF-IDF embeddings
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words=None,  # Keep French stopwords if needed
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95
    )
    
    embeddings = vectorizer.fit_transform(chunks_df['chunk_text']).toarray()
    embedding_model = vectorizer
    
elif EMBEDDING_METHOD == 'sbert' and SBERT_AVAILABLE:
    # Sentence-BERT embeddings
    model = SentenceTransformer(SBERT_MODEL)
    embeddings = model.encode(chunks_df['chunk_text'].tolist(), show_progress_bar=True)
    embedding_model = model
    
else:
    raise ValueError(f"Embedding method '{EMBEDDING_METHOD}' not available or supported")

print(f"Embeddings shape: {embeddings.shape}")

Generating tfidf embeddings...
Embeddings shape: (942, 5000)
Embeddings shape: (942, 5000)


## 5. Build Search Index

In [11]:
# Build nearest neighbors index
print("Building search index...")

# Use cosine similarity
index = NearestNeighbors(
    n_neighbors=min(10, len(chunks_df)),
    metric='cosine',
    algorithm='brute'  # Better for small datasets
)

index.fit(embeddings)
print("Search index built successfully")

Building search index...
Search index built successfully


## 6. Save Artifacts

In [12]:
# Prepare metadata
metadata = {
    'timestamp': dt.datetime.now().isoformat(),
    'source_csv': str(csv_path),
    'num_documents': len(df_dedup),
    'num_chunks': len(chunks_df),
    'embedding_method': EMBEDDING_METHOD,
    'embedding_dim': embeddings.shape[1],
    'chunk_size': CHUNK_SIZE,
    'chunk_overlap': CHUNK_OVERLAP,
    'min_text_length': MIN_TEXT_LENGTH
}

if EMBEDDING_METHOD == 'sbert':
    metadata['sbert_model'] = SBERT_MODEL

# Save everything
timestamp = save_artifacts(chunks_df, embeddings, index, metadata)
print("Pipeline completed successfully!")

Artifacts saved with timestamp 20250831_154154
Pipeline completed successfully!


## 7. Search Helper

In [13]:
def search_documents(query, k=5):
    """Search for relevant chunks using the trained index."""
    if EMBEDDING_METHOD == 'tfidf':
        query_embedding = embedding_model.transform([query]).toarray()
    elif EMBEDDING_METHOD == 'sbert':
        query_embedding = embedding_model.encode([query])
    
    # Find nearest neighbors
    distances, indices = index.kneighbors(query_embedding, n_neighbors=k)
    
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        chunk_info = chunks_df.iloc[idx]
        similarity = 1 - dist  # Convert distance to similarity
        
        results.append({
            'rank': i + 1,
            'similarity': similarity,
            'doc_id': chunk_info['doc_id'],
            'chunk_id': chunk_info['chunk_id'],
            'text': chunk_info['chunk_text'][:200] + '...' if len(chunk_info['chunk_text']) > 200 else chunk_info['chunk_text']
        })
    
    return results

# Test search
print("Testing search functionality...")
test_query = "emploi formation"
test_results = search_documents(test_query, k=3)

print(f"\\nSearch results for '{test_query}':")
for result in test_results:
    print(f"\\n{result['rank']}. Similarity: {result['similarity']:.3f}")
    print(f"Document: {result['doc_id']}")
    print(f"Text: {result['text']}")

Testing search functionality...
\nSearch results for 'emploi formation':
\n1. Similarity: 0.137
Document: doc_133
Text: DIRECTION DES RESSOURCES HUMAINES 

SERVICE EMPLOI, FORMATION ET PARCOURS PROFESSIONNELS 

SECTEUR RECRUTEMENT – MOBILITE – EVOLUTION PROFESSIONNELLE 

UN.E AGENT.E ADMINISTRATIF.VE ET D'ACCUEIL 

Cab...
\n2. Similarity: 0.121
Document: doc_882
Text: DIRECTION DES RESSOURCES HUMAINES 

SERVICE EMPLOI, FORMATION ET PARCOURS PROFESSIONNELS 

SECTEUR RECRUTEMENT – MOBILITE – EVOLUTION PROFESSIONNELLE 

LA VILLE DE METZ, 
118 634 habi an s, ville-cen ...
\n3. Similarity: 0.116
Document: doc_486
Text: DIRECTION DES RESSOURCES HUMAINES 

SERVICE EMPLOI, FORMATION ET PARCOURS PROFESSIONNELS 

SECTEUR RECRUTEMENT – MOBILITE – EVOLUTION PROFESSIONNELLE 

LA VILLE DE METZ, 
118 634 habi an s, ville-cen ...
