In [13]:
"""EPUB Edition Comparison RAG System - Local Transformers Edition

Compare two editions of a book using hybrid RAG similarity search.

Dependencies:
    pip install "unstructured[epub]" qdrant-client transformers torch numpy
    
System requirements:
    apt install pandoc
    GPU: RTX 3070 (8GB VRAM) or better
"""

import os
import uuid
from typing import List, Dict

import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from unstructured.partition.auto import partition
from unstructured.chunking.basic import chunk_elements
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

qdrant_client = QdrantClient(":memory:")

print("Loading Jina CLIP v2 model (768-dim text embeddings)...")
model = AutoModel.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)
model.eval()

if torch.cuda.is_available():
    model = model.cuda()
    print("Model loaded on GPU")
else:
    print("Model loaded on CPU")

VECTOR_DIM = 768


def embed_text(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using Jina CLIP v2 locally via Transformers."""
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=8192
    )
    
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs, truncate_dim=VECTOR_DIM)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    
    return embeddings.tolist()


def extract_and_chunk(file_path: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
    """Extract and chunk document using Unstructured."""
    print(f"Extracting from: {file_path}")
    try:
        elements = partition(filename=file_path, strategy="auto")
    except Exception as e:
        print(f"Error parsing (requires pandoc): {e}")
        return []
    
    chunks = chunk_elements(
        elements,
        max_characters=chunk_size * 5,
        new_after_n_chars=chunk_size * 4,
        overlap=overlap * 5,
        overlap_all=True
    )
    
    processed_chunks = []
    for idx, chunk in enumerate(chunks):
        metadata = chunk.metadata.to_dict() if hasattr(chunk.metadata, 'to_dict') else {}
        processed_chunks.append({
            'text': str(chunk),
            'chunk_idx': idx,
            'page_number': metadata.get('page_number'),
            'filename': metadata.get('filename', file_path),
            'id': f"chunk{idx}"
        })
    return processed_chunks


def create_collection(collection_name: str, vector_size: int = VECTOR_DIM):
    """Create Qdrant collection for storing embeddings."""
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )


def index_chunks(chunks: List[Dict], collection_name: str, edition_label: str, batch_size: int = 50):
    """Generate embeddings and index chunks in Qdrant."""
    if not chunks:
        print(f"No chunks to index for {edition_label}")
        return

    print(f"Generating embeddings for {len(chunks)} chunks...")
    all_points = []
    
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        texts = [chunk['text'] for chunk in batch]
        embeddings = embed_text(texts)
        
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=emb,
                payload={**chunk, 'edition': edition_label}
            )
            for chunk, emb in zip(batch, embeddings)
        ]
        all_points.extend(points)
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    upload_batch_size = 100
    for i in range(0, len(all_points), upload_batch_size):
        qdrant_client.upsert(
            collection_name=collection_name,
            points=all_points[i:i + upload_batch_size]
        )
    
    print(f"Indexed {len(all_points)} chunks for {edition_label}")


def search_similar_chunks(query: str, collection_name: str, top_k: int = 5):
    """Search for chunks similar to query."""
    query_embedding = embed_text([query])[0]
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )


def compare_editions(query: str, collection_name: str, top_k: int = 3) -> Dict:
    """Compare how a topic appears in both editions."""
    all_results = search_similar_chunks(query, collection_name, top_k * 2)
    
    ed1_results = [r for r in all_results if r.payload['edition'] == 'edition1'][:top_k]
    ed2_results = [r for r in all_results if r.payload['edition'] == 'edition2'][:top_k]
    
    differences = []
    for ed1_result in ed1_results:
        ed1_embedding = np.array(embed_text([ed1_result.payload['text']])[0])
        
        best_match = None
        best_similarity = -1
        
        for ed2_result in ed2_results:
            ed2_embedding = np.array(embed_text([ed2_result.payload['text']])[0])
            similarity = np.dot(ed1_embedding, ed2_embedding) / (
                np.linalg.norm(ed1_embedding) * np.linalg.norm(ed2_embedding)
            )
            
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = ed2_result
        
        differences.append({
            'edition1': {
                'text': ed1_result.payload['text'],
                'page': ed1_result.payload.get('page_number'),
                'score': ed1_result.score
            },
            'edition2': {
                'text': best_match.payload['text'] if best_match else None,
                'page': best_match.payload.get('page_number') if best_match else None,
                'score': best_match.score if best_match else 0
            },
            'cross_similarity': float(best_similarity),
            'is_different': best_similarity < 0.85
        })
    
    return {
        'query': query,
        'differences': differences
    }


def display_comparison(results: Dict):
    """Print comparison results."""
    print(f"\n{'=' * 80}")
    print(f"Query: {results['query']}")
    print(f"{'=' * 80}\n")
    
    for idx, diff in enumerate(results['differences'], 1):
        status = 'DIFFERENT' if diff['is_different'] else 'SIMILAR'
        print(f"Match {idx} - {status} ({diff['cross_similarity']:.2%} similarity)")
        print(f"\nEdition 1:")
        print(f"  {diff['edition1']['text'][:200]}...")
        
        if diff['edition2']['text']:
            print(f"\nEdition 2:")
            print(f"  {diff['edition2']['text'][:200]}...")
        
        print("-" * 80)


def main(edition1_path: str, edition2_path: str):
    """Main workflow for comparing two editions."""
    edition1_path = os.path.expanduser(edition1_path)
    edition2_path = os.path.expanduser(edition2_path)
    
    if not os.path.exists(edition1_path) or not os.path.exists(edition2_path):
        print("Error: Files not found")
        print(f"  {edition1_path}")
        print(f"  {edition2_path}")
        return
    
    print("Processing documents...")
    chunks1 = extract_and_chunk(edition1_path)
    chunks2 = extract_and_chunk(edition2_path)
    
    if not chunks1 or not chunks2:
        print("Error: Failed to extract chunks")
        return
    
    print(f"Edition 1: {len(chunks1)} chunks")
    print(f"Edition 2: {len(chunks2)} chunks")
    
    print("\nCreating vector index...")
    collection_name = "book_comparison"
    create_collection(collection_name)
    
    index_chunks(chunks1, collection_name, "edition1")
    index_chunks(chunks2, collection_name, "edition2")
    
    print("\n" + "=" * 80)
    print("Ready for queries. Type 'quit' to exit.")
    print("=" * 80)
    
    while True:
        query = input("\nQuery: ").strip()
        if query.lower() in ['quit', 'exit', 'q']:
            break
        if query:
            try:
                results = compare_editions(query, collection_name)
                display_comparison(results)
            except Exception as e:
                print(f"Error: {e}")


if __name__ == "__main__":
    path1 = "~/workspace/Datasets/shadow-lb/2014 Martin Fowler - Refactoring_Recl.epub"
    path2 = "~/workspace/Datasets/shadow-lb/Refactoring Improving the Design of Existing Code, 2nd Edition by Martin Fowler.epub"
    
    main(path1, path2)

[2025-11-23 03:24:34] INFO configuration_clip.py:258: `text_config` is `None`. Initializing the `JinaCLIPTextConfig` with default values.
[2025-11-23 03:24:34] INFO configuration_clip.py:265: `vision_config` is `None`. initializing the `JinaCLIPVisionConfig` with default values.


Loading Jina CLIP v2 model (768-dim text embeddings)...


A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- transform.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.73G [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

configuration_xlm_roberta.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py: 0.00B [00:00, ?B/s]

modeling_xlm_roberta.py: 0.00B [00:00, ?B/s]

mha.py: 0.00B [00:00, ?B/s]

rotary.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py: 0.00B [00:00, ?B/s]

stochastic_depth.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- mha.py
- mlp.py
- block.py
- xlm_padding.py
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Model loaded on GPU
Processing documents...
Extracting from: /home/jovyan/workspace/Datasets/shadow-lb/2014 Martin Fowler - Refactoring_Recl.epub


  data file translations/en.yaml not found
  data file translations/en.yaml not found




Extracting from: /home/jovyan/workspace/Datasets/shadow-lb/Refactoring Improving the Design of Existing Code, 2nd Edition by Martin Fowler.epub


  data file translations/en.yaml not found
  data file translations/en.yaml not found




Edition 1: 357 chunks
Edition 2: 385 chunks

Creating vector index...
Generating embeddings for 357 chunks...


  qdrant_client.recreate_collection(


AttributeError: 'NoneType' object has no attribute 'shape'