In [2]:
# Setup and Imports
import os
import sys
import warnings
from pathlib import Path

# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Locate the project root: the parent directory when the kernel was started
# inside `notebooks/`, otherwise the current working directory itself.
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent
src_path = str(project_root / 'src')

# Prepend `src` first and the project root second, skipping any entry that is
# already present on sys.path.
for _path_entry in (src_path, str(project_root)):
    if _path_entry not in sys.path:
        sys.path.insert(0, _path_entry)

print(f"✓ Project root: {project_root}")
print(f"✓ Added to path: {src_path}")
print("✓ Imports configured")

✓ Project root: d:\Documents\INSA_Lyon\INSA 4A\TCD 1\CASML-Generative-AI-Hackathon
✓ Added to path: d:\Documents\INSA_Lyon\INSA 4A\TCD 1\CASML-Generative-AI-Hackathon\src
✓ Imports configured


## Step 1: Import Libraries

Import all necessary libraries for the RAG pipeline

In [1]:
# Core libraries
import numpy as np
import pandas as pd

# PDF Processing
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embeddings (using sentence-transformers, no TensorFlow needed)
from sentence_transformers import SentenceTransformer
from FlagEmbedding import FlagReranker

# Vector Store
import faiss

# LLM Model
import transformers

print("✓ All libraries imported successfully")

  from .autonotebook import tqdm as notebook_tqdm


✓ All libraries imported successfully


## Step 2: Load PDF and Split into Chunks

1. Load the document (PyPDFLoader automatically adds 'page' to the metadata)

In [2]:
# Load the book into LangChain documents.
# The path is relative, so this assumes the kernel's working directory is notebooks/.
loader = PyPDFLoader("../data/raw/book.pdf")
documents = loader.load()

# Inspect the first document to check that the metadata (including 'page') is present
documents[0].metadata

{'producer': 'Prince 14.2 (www.princexml.com)',
 'creator': 'PyPDF',
 'creationdate': '2022-02-24T09:25:50-06:00',
 'moddate': '2022-03-01T11:18:04-06:00',
 'title': 'Psychology 2e',
 'source': '../data/raw/book.pdf',
 'total_pages': 753,
 'page': 0,
 'page_label': '1'}

2. Split the text into chunks (chunking)

In [3]:
# The chunk size should not be too small so each chunk keeps enough context;
# the overlap helps avoid cutting a sentence in half at a chunk boundary.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200,
    add_start_index=True
)

all_splits = text_splitter.split_documents(documents)

# Note: when splitting, LangChain automatically copies the metadata (page number)
# from the source document onto every chunk.

In [4]:
# Number of chunks produced by the splitter
len(all_splits)

2963

## Step 3: Load Embedding Model

Using BGE-large-en-v1.5 (sentence-transformers)

In [5]:
# Load BGE embedding model
# `embedding_model` is reused by every later cell that encodes chunks or queries.
model_emb_name = "BAAI/bge-large-en-v1.5"
embedding_model = SentenceTransformer(model_emb_name)

# Sanity check: report which model was loaded and its embedding dimension
print(f"✓ Loaded embedding model: {model_emb_name}")
print(f"✓ Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")

✓ Loaded embedding model: BAAI/bge-large-en-v1.5
✓ Embedding dimension: 1024


## Step 4: Generate Embeddings for Chunks

Encode all text chunks using BGE model

In [6]:
# Extract text from chunks
# chunk_texts[i] is the text of all_splits[i]; later cells rely on this
# alignment to map FAISS row ids back to chunk text and page metadata.
chunk_texts = [doc.page_content for doc in all_splits]

print(f"Generating embeddings for {len(chunk_texts)} chunks...")
print("This may take a few minutes...")

# Generate embeddings with BGE
# BGE recommends adding instruction for queries, but not for passages,
# so the chunk texts are encoded as-is.
embeddings = embedding_model.encode(
    chunk_texts,
    normalize_embeddings=True,  # Normalize for cosine similarity
    show_progress_bar=True,
    batch_size=32
)

# One row per chunk, one column per embedding dimension
print(f"✓ Generated embeddings shape: {embeddings.shape}")

Generating embeddings for 2963 chunks...
This may take a few minutes...


Batches: 100%|██████████| 93/93 [04:54<00:00,  3.16s/it]

✓ Generated embeddings shape: (2963, 1024)





## Step 5: Build FAISS Index

Create vector index for fast similarity search

In [7]:
# Get embedding dimension
embedding_dim = embeddings.shape[1]

# Create FAISS index (Inner Product for normalized vectors = Cosine Similarity)
index = faiss.IndexFlatIP(embedding_dim)

# Add embeddings to index
# Row i of the index corresponds to chunk_texts[i] / all_splits[i].
index.add(embeddings.astype('float32')) # cast to float32, the dtype FAISS indexes operate on

print(f"✓ FAISS index created")
print(f"  Dimension: {embedding_dim}")
print(f"  Total vectors: {index.ntotal}")

✓ FAISS index created
  Dimension: 1024
  Total vectors: 2963


## Step 6: Test Retrieval

Search for relevant documents

In [8]:
# Test query
test_query = "What are the contributions made by Freud in psychology?"

# BGE wants the retrieval instruction on the query side only
query_instruction = "Represent this sentence for searching relevant passages: "
query_with_instruction = f"{query_instruction}{test_query}"

# Embed the single query, normalized like the chunk embeddings
query_embedding = embedding_model.encode(
    [query_with_instruction],
    normalize_embeddings=True,
)

# Nearest-neighbour lookup in FAISS
k = 5  # Top-5 results
distances, indices = index.search(query_embedding.astype('float32'), k)

print(f"Query: {test_query}\n")
print(f"Top {k} results:")
print("=" * 70)

# Walk the hits in rank order; each FAISS id indexes both all_splits and chunk_texts
for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), start=1):
    page = all_splits[idx].metadata.get('page', 'N/A')
    preview = chunk_texts[idx][:200]
    print(f"\n[{rank}] Score: {score:.4f}")
    print(f"Page: {page}")
    print(f"Content: {preview}...")

Query: What are the contributions made by Freud in psychology?

Top 5 results:

[1] Score: 0.6960
Page: 373
Content: • Define and describe the nature and function of the id, ego, and superego
• Define and describe the defense mechanisms
• Define and describe the psychosexual stages of personality development
Sigmund...

[2] Score: 0.6930
Page: 22
Content: the unconscious mind could be accessed through dream analysis, by examinations of the first words that came
to people’s minds, and through seemingly innocent slips of the tongue. Psychoanalytic theory...

[3] Score: 0.6880
Page: 22
Content: FIGURE 1.3 William James, shown here in a self-portrait, was the first American psychologist.
Freud and Psychoanalytic Theory
Perhaps one of the most influential and well-known figures in psychology’s...

[4] Score: 0.6827
Page: 692
Content: Freud, S. (1920). Resistance and suppression. A general introduction to psychoanalysis (pp. 248–261). Horace
Liveright.
Freud, S. (1923/1949). The ego and the 

## Step 7: Load Reranker Model

Use FlagReranker to improve retrieval quality

In [9]:
# Load reranker
# `reranker_model` scores [query, passage] pairs in the reranking cells below;
# use_fp16=True requests half-precision inference.
reranker_model = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

print(f"✓ Loaded reranker: BAAI/bge-reranker-v2-m3")

✓ Loaded reranker: BAAI/bge-reranker-v2-m3


## Step 8: Retrieval with Reranking

Retrieve more candidates, then rerank for better results

In [10]:
# Two-stage retrieval: a wide FAISS candidate pool, narrowed by the reranker
k_faiss = 50       # candidates fetched from the index
k_reRanker = 5     # results kept after reranking

# Stage 1: FAISS search (reuses query_embedding from the retrieval test cell)
distances, indices = index.search(query_embedding.astype('float32'), k_faiss)

# Stage 2 input: one [query, passage] pair per candidate, in FAISS order
pairs = []
for idx in indices[0]:
    pairs.append([test_query, chunk_texts[idx]])

# Score every pair with the reranker
rerank_scores = reranker_model.compute_score(pairs, normalize=True)

# Order candidates by rerank score, best first, and keep the top few
scored_candidates = list(zip(indices[0], rerank_scores))
scored_candidates.sort(key=lambda item: item[1], reverse=True)
ranked_results = scored_candidates[:k_reRanker]

print(f"Query: {test_query}\n")
print(f"Top {k_reRanker} results (after reranking):")
print("=" * 60)

for position, (idx, score) in enumerate(ranked_results, start=1):
    page = all_splits[idx].metadata.get('page', 'N/A')
    preview = chunk_texts[idx][:200]
    print(f"\n[{position}] Rerank Score: {score:.4f}")
    print(f"Page: {page}")
    print(f"Content: {preview}...")

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Query: What are the contributions made by Freud in psychology?

Top 5 results (after reranking):

[1] Rerank Score: 0.8440
Page: 295
Content: nurturance and parenting during a stage, we may become stuck, or fixated, in that stage. Freud’s stages are
called the stages of psychosexual development. According to Freud, children’s pleasure-seeki...

[2] Rerank Score: 0.7160
Page: 23
Content: interactions, and the development of personality over time. Westen identifies subsequent research support for
all of these ideas.
More modern iterations of Freud’s clinical approach have been empirica...

[3] Rerank Score: 0.5871
Page: 33
Content: people in this country will be 65 or older (Department of Health and Human Services, n.d.).
Personality Psychology
Personality psychology focuses on patterns of thoughts and behaviors that make each i...

[4] Rerank Score: 0.5460
Page: 380
Content: negative dynamics of the youngest and oldest children. Despite popular attention, research has not
conclusively c

## Step 9: LLM Model (Qwen2.5-1.5B-Instruct)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load LLM model and tokenizer
# Using Qwen2.5 (open-source, no gating, good performance)
model_llm_name = "Qwen/Qwen2.5-1.5B-Instruct"
print(f"Loading LLM model: {model_llm_name}...")

# Check device: tokenized inputs are moved to this device before generation
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer and model.
# Half precision on GPU, full precision on CPU. The keyword is `dtype`:
# `torch_dtype` is reported as deprecated by the installed transformers version.
# device_map="auto" lets the library decide where to place the weights.
tokenizer = AutoTokenizer.from_pretrained(model_llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_llm_name,
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

print(f"✓ LLM model loaded successfully on {device}")
print(f"✓ Model size: ~1.5B parameters")

Loading LLM model: Qwen/Qwen2.5-1.5B-Instruct...
Using device: cuda


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
Some parameters are on the meta device because they were offloaded to the cpu and disk.
Some parameters are on the meta device because they were offloaded to the cpu and disk.


✓ LLM model loaded successfully on cuda
✓ Model size: ~1.5B parameters


In [13]:
def generate_answer(query: str, contexts: list, max_length: int = 512) -> str:
    """
    Generate an answer to `query` with the LLM, grounded in retrieved contexts.

    The contexts are numbered and placed in a plain-text prompt that ends with
    "Answer:"; only the text the model generates after that prompt is returned.
    Sampling is enabled, so repeated calls may return different answers.

    Relies on the notebook-level `tokenizer`, `llm_model` and `device`.

    Args:
        query: User question
        contexts: List of relevant context strings
        max_length: Maximum number of *new* tokens to generate (forwarded as
            `max_new_tokens`; the prompt length is not counted)

    Returns:
        Generated answer string, stripped of surrounding whitespace
    """
    # Format prompt with contexts
    context_text = "\n\n".join(f"Context {i+1}:\n{ctx}" for i, ctx in enumerate(contexts))

    prompt = f"""Using the following contexts, answer the question concisely and accurately.

{context_text}

Question: {query}

Answer:"""

    # Tokenize. The prompt is capped at 2048 tokens.
    # NOTE(review): if contexts ever exceed that budget, truncation presumably
    # drops the end of the prompt (the question and the "Answer:" cue) first;
    # confirm the tokenizer's truncation side before enlarging the contexts.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. The output sequence starts with
    # the prompt tokens; slicing them off is more reliable than splitting the
    # decoded text on "Answer:", which picks the wrong segment whenever the
    # model itself emits another "Answer:".
    generated_tokens = outputs[0][prompt_token_count:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return answer

# Test generation
# `indices` is the notebook-level result of the most recent FAISS search, so
# these are the first three raw FAISS candidates, not the reranked results.
test_contexts = [chunk_texts[idx] for idx in indices[0][:3]]
test_answer = generate_answer(test_query, test_contexts)
print(f"Query: {test_query}")
print(f"\nGenerated Answer:\n{test_answer}")

Query: What are the contributions made by Freud in psychology?

Generated Answer:
Sigmund Freud contributed significantly to psychology's understanding of the human psyche through his theories on the id, ego, and superego, defense mechanisms, and psychosexual stages of personality development. He proposed that the unconscious mind plays a critical role in shaping behavior and emotions, advocating for techniques like dream analysis and treatment methods based on his psychoanalytic theory. His work influenced clinical psychology and remains relevant today.


## Step 10: Process Multiple Queries

Load queries from JSON and generate submission

In [14]:
import json
import csv
from pathlib import Path
from tqdm import tqdm

# Input / output locations (relative to the notebooks/ directory)
queries_file = "../data/raw/queries.json"
output_file = "../data/outputs/submission.csv"

# Retrieval settings: constant for the whole run, so defined once up front
query_instruction = "Represent this sentence for searching relevant passages: "
k_candidates = 50  # candidates fetched from FAISS
k_final = 5        # chunks kept after reranking


def _retrieve_and_rerank(question, n_candidates, n_final):
    """Return the best `n_final` (chunk index, rerank score) pairs for `question`.

    Encodes the question with the BGE query instruction, fetches `n_candidates`
    neighbours from the FAISS index, scores them with the reranker and returns
    them sorted by rerank score, best first. All intermediate values stay local,
    so the `query_embedding` / `indices` globals used by the test cells above
    are not overwritten.
    """
    question_embedding = embedding_model.encode(
        [query_instruction + question],
        normalize_embeddings=True
    )
    _, neighbour_ids = index.search(question_embedding.astype('float32'), n_candidates)

    # FAISS pads the result with -1 when fewer than n_candidates vectors are
    # available; drop those so they cannot index chunk_texts from the end.
    candidate_ids = [idx for idx in neighbour_ids[0] if idx >= 0]

    candidate_pairs = [[question, chunk_texts[idx]] for idx in candidate_ids]
    candidate_scores = reranker_model.compute_score(candidate_pairs, normalize=True)

    return sorted(
        zip(candidate_ids, candidate_scores),
        key=lambda x: x[1],
        reverse=True
    )[:n_final]


# Load queries
print(f"Loading queries from {queries_file}...")
with open(queries_file, 'r', encoding='utf-8') as f:
    queries = json.load(f)

print(f"✓ Loaded {len(queries)} queries")
print(f"Processing queries and generating answers...")

# Make sure the output directory exists before creating the CSV
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Create/overwrite the CSV, then write one row per query. The file is flushed
# after every row so finished answers survive an interrupted run.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "context", "answer", "references"])

    for query_data in tqdm(queries, desc="Processing queries"):
        query_id = query_data["query_id"]
        question = query_data["question"]

        # Steps 1-3: encode the query, retrieve candidates, rerank
        ranked_results = _retrieve_and_rerank(question, k_candidates, k_final)

        # Step 4: Get contexts and generate answer
        top_contexts = [chunk_texts[idx] for idx, _ in ranked_results]
        answer = generate_answer(question, top_contexts, max_length=256)

        # Step 5: Get references (page numbers)
        # NOTE(review): the loader's 'page' value is 0-based (the first page has
        # page=0 but page_label='1'); confirm which one the submission expects.
        references = [all_splits[idx].metadata.get('page', 'N/A') for idx, _ in ranked_results]
        references_str = ','.join(map(str, references))

        # Step 6: Combine contexts
        context_combined = "\n\n".join(top_contexts)

        # Step 7: Save to CSV
        writer.writerow([query_id, context_combined, answer, references_str])
        f.flush()

print(f"\n✓ Processing complete!")
print(f"✓ Saved results to: {output_file}")
print(f"✓ Total queries processed: {len(queries)}")

Loading queries from ../data/raw/queries.json...
✓ Loaded 50 queries
Processing queries and generating answers...


Processing queries:   0%|          | 0/50 [00:00<?, ?it/s]

Processing queries: 100%|██████████| 50/50 [1:56:58<00:00, 140.37s/it]


✓ Processing complete!
✓ Saved results to: ../data/outputs/submission.csv
✓ Total queries processed: 50





## Summary

Pipeline completed with:
1. ✅ PDF loading with LangChain PyPDFLoader
2. ✅ Recursive text chunking (1000 chars, 200 overlap)
3. ✅ BGE embeddings (BAAI/bge-large-en-v1.5)
4. ✅ FAISS vector indexing
5. ✅ Semantic search with cosine similarity
6. ✅ FlagReranker (BAAI/bge-reranker-v2-m3)
7. ✅ Two-stage retrieval: FAISS → Reranking
8. ✅ LLM answer generation (Qwen/Qwen2.5-1.5B-Instruct)
9. ✅ Batch processing of multiple queries
10. ✅ Results saved to CSV for Kaggle submission

**Next Steps:**
- Implement TOC extraction for references