# Warhammer RAG

## Installation

In [1]:
!python -m pip install --upgrade pip
!pip install llama-cpp-python
! pip install cerebras-cloud-sdk
!python -m spacy download pl_core_news_lg
!pip install langchain langchain-community sentence-transformers chromadb
!pip install pypdf requests pydantic tqdm
!pip install rank_bm25
#!pip install flash-attn --no-build-isolation

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.14.tar.gz (51.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading dis

## Imports

In [74]:
import os
import re
import time
import uuid
import spacy
import torch
import openai
import requests
import tiktoken
import chromadb
import numpy as np

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
from rank_bm25 import BM25Okapi
from collections import defaultdict
from chromadb.config import Settings
from kaggle_secrets import UserSecretsClient
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from cerebras.cloud.sdk import Cerebras
from spacy.tokens import Doc
from typing import List
from tqdm import tqdm

## load secret keys

In [None]:
user_secrets = UserSecretsClient()
cerebras_key = user_secrets.get_secret("Cerebras_ai_api")
openrouter_key = user_secrets.get_secret("OPENROUTER_KEY")

## Functions

In [51]:
# load our document
def load_document(doc_path: str):
    loader = PyPDFLoader(data_path)
    wh_rulebook = loader.load()
    wh_rulebook_content = [content.page_content for content in wh_rulebook]
    #wh_rulebook_raw = ''
    #wh_rulebook_raw = ' '.join(wh_rulebook_content)
    return wh_rulebook_content

# split our data into chunks (one chunk - one page)
def smart_overlap_pages(pages, overlap_chars: int = 200):
    overlapped_chunks = []
    for i in range(len(pages)):
        prev = pages[i - 1] if i > 0 else ""
        next = pages[i + 1] if i < len(pages) - 1 else ""

        prev_overlap = prev[-overlap_chars:] if len(prev) > overlap_chars else prev
        next_overlap = next[:overlap_chars] if len(next) > overlap_chars else next

        chunk = prev_overlap + "\n" + pages[i] + "\n" + next_overlap
        overlapped_chunks.append(chunk)
    return overlapped_chunks

# initialize our embeddings model
def initialize_model(model_name: str):
    # encoder - initialize our encoder to create embedings
    # sdadas/mmlw-roberta-base - 124m par
    # sdadas/mmlw-retrieval-roberta-large-v2 435m par

    model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        device=None,
        model_kwargs={"trust_remote_code": True}
        #model_kwargs={"attn_implementation": "flash_attention_2", "trust_remote_code": True} Not implemented yet for roberta base
    )
    model.bfloat16
    return model

# create embedings for our document
def create_embedings(model, wh_content_chunked):
    query_prefix = "zapytanie: "
    answer_prefix = ""
    wh_embedings = []

    for content in wh_content_chunked:
        queries = [query_prefix + content]
        encode = model.encode(queries, show_progress_bar=False)
        wh_embedings.append(encode)

    wh_embedings = torch.tensor(wh_embedings, dtype=torch.bfloat16)
    wh_embedings = wh_embedings.squeeze(1)
    return wh_embedings

# split our text into max size chunks
def chunk_text(text, chunk_size=100000):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

In [None]:
pattern = r"Wprowadzenie"
start = re.finditer(pattern, wh_rulebook_raw)

wh_rulebook_raw[start-10:start+1000]

## Vector Database

In [7]:
class VectorStore:
    def __init__(self, collection_name="warhammer fantasy", persist_directory="./chroma_db"):
        """Initialize ChromaDB vector store"""
        self.client = chromadb.PersistentClient(path=persist_directory)
        
        # Create or get collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}  # Use cosine similarity
        )
    
    def add_documents(self, chunks, embeddings):
        """Add document chunks to vector store"""
        ids = [str(uuid.uuid4()) for _ in range(len(chunks))]
        
        self.collection.add(
            documents=[chunk for chunk in chunks],
            embeddings=embeddings.tolist(),
            ids=ids
        )
    
    def similarity_search(self, query_embedding, k=5, filter_dict=None):
        """Search for similar documents"""
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=k,
            where=filter_dict
        )
        
        return results

## *Additional **optional** steps

### Hybrid retriever

Combine cos similarity method with additional one which check words frequency in our query(BM25) to choose best available chunks

In [81]:
# optional - additional retrievel to improve our performance

class HybridRetriever:
    def __init__(self, vector_store, embedding_model):
        self.vector_store = vector_store
        self.embedding_model = embedding_model
        self.bm25 = None
        self.documents = []
    
    def setup_bm25(self, documents):
        """Setup BM25 for keyword-based retrieval"""
        self.documents = documents
        tokenized_docs = [self._tokenize(doc) for doc in documents]
        self.bm25 = BM25Okapi(tokenized_docs)
    
    def _tokenize(self, text):
        """Simple tokenization for BM25"""
        return re.findall(r'\b\w+\b', text.lower())
    
    def retrieve(self, query, k=10, alpha=0.7):
        """
        Hybrid retrieval combining semantic and keyword search
        
        Args:
            query: User query
            k: Number of documents to retrieve
            alpha: Weight for semantic search (1-alpha for BM25)
        """
        # Semantic search
        query = "Zapytanie: {query}"
        query_embedding = self.embedding_model.encode(query, show_progress_bar=False)
        semantic_results = self.vector_store.similarity_search(query_embedding, k=k*2)
        
        # BM25 search
        if self.bm25:
            tokenized_query = self._tokenize(query)
            bm25_scores = self.bm25.get_scores(tokenized_query)
            bm25_results = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)[:k*2]
        else:
            bm25_results = []
        
        # Combine and rerank results
        combined_results = self._combine_results(semantic_results, bm25_results, alpha)
        
        return combined_results[:k]
    
    def _combine_results(self, semantic_results, bm25_results, alpha):
        """
        Combine semantic and BM25 results with weighted scoring using document index.

        Args:
            semantic_results: List of document objects from vector store
            bm25_results: List of (index, score) tuples from BM25
            alpha: Weight for semantic scores

        Returns:
            List of combined documents reranked by hybrid score
        """
        combined_scores = defaultdict(float)

        # Normalize BM25 scores
        if bm25_results:
            bm25_values = np.array([score for _, score in bm25_results])
            bm25_norm = (bm25_values - bm25_values.min()) / (bm25_values.ptp() + 1e-9)
            for (idx, _), score in zip(bm25_results, bm25_norm):
                combined_scores[idx] += (1 - alpha) * score

        # Normalize semantic scores
        semantic_values = np.array([doc.score if hasattr(doc, "score") else 1.0 for doc in semantic_results])
        semantic_norm = (semantic_values - semantic_values.min()) / (semantic_values.ptp() + 1e-9)
        for idx, (doc, score) in enumerate(zip(semantic_results, semantic_norm)):
            combined_scores[idx] += alpha * score

        # Rerank based on combined scores
        ranked_indices = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Return document objects using indices from semantic_results
        combined_docs = [semantic_results[idx] for idx, _ in ranked_indices if idx < len(semantic_results)]
        
        return combined_docs

In [10]:
bm = BM25Okapi(wh_rulebook_overlap)
test = wh_rulebook_overlap[25]
bm25_scores = bm.get_scores(test)
bm25_results = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)[:10*2]
bm25_results

[(0, -2398.235319707179),
 (353, -3651.9779336074557),
 (1, -4414.512119553013),
 (265, -4445.269298931796),
 (345, -4662.766154168548),
 (352, -4734.125405567533),
 (348, -4734.689860508153),
 (350, -4738.656630040257),
 (347, -4739.911627819456),
 (351, -4740.795987552746),
 (17, -4742.711883337711),
 (2, -4744.965502502055),
 (349, -4748.159446805872),
 (19, -4754.374965831195),
 (18, -4756.551181832996),
 (346, -4757.109619757759),
 (16, -4757.811666166828),
 (9, -4761.3473056319735),
 (13, -4769.642190235066),
 (3, -4772.364365148799)]

### API tests

In [None]:
client = Cerebras(
    # This is the default and can be omitted
    api_key=cerebras_key
)

stream = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "What is the meaning of life?"
        }
    ],
    model="llama-3.3-70b",
    stream=True,
    max_completion_tokens=10240,
    temperature=0.7,
    top_p=1
)

for chunk in stream:
  print(chunk.choices[0].delta.content or "", end="")

In [None]:
# API Tests
# gemini - 6.5
# kimi - 5.4
# R1 - 21.8
# V3 - 12.7
# chimera - 35.6
# qwen - 29.2

models = {'r1':'deepseek/deepseek-r1-0528:free',
         'v3':'deepseek/deepseek-chat-v3-0324:free',
         'chimera':'tngtech/deepseek-r1t2-chimera:free',
         'kimi':'moonshotai/kimi-k2:free',
         'gemini':'google/gemini-2.0-flash-exp:free',
         'qwen': 'qwen/qwq-32b:free'
}

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=openrouter_key,
)

completion = client.chat.completions.create(
  extra_headers={
    "HTTP-Referer": "<YOUR_SITE_URL>", # Optional. Site URL for rankings on openrouter.ai.
    "X-Title": "<YOUR_SITE_NAME>", # Optional. Site title for rankings on openrouter.ai.
  },
  extra_body={},
  model = models['qwen'],
  messages=[
    {
      "role": "user",
      "content": "What is the meaning of life?"
    }
  ]
)
print(completion.choices[0].message.content)

### Entities extractor

In [17]:
# check for entities in our doc
nlp = spacy.load("pl_core_news_lg")  # or "md", "lg" for larger models

chunks = chunk_text(wh_rulebook_raw)
docs = [nlp(chunk) for chunk in chunks]
combined_doc = Doc.from_docs(docs)
entities = [(ent.text, ent.label_) for ent in combined_doc.ents]

### Query enchancer

Improve our question by generating additional querys which will help model to answer our question.  
Additionaly it check if some popular entities are not present in our query

In [None]:
class QueryEnhancer:
    def __init__(self, api_key):
        self.client = Cerebras(
        api_key=api_key
        )
    
    def expand_warhammer_query(self, query: str) -> List[str]:
        """Generate multiple query variations for better retrieval"""
        prompt = f"""
        Biorąc pod uwagę zapytanie dotyczące Warhammer Fantasy: '{query}'
        
        Wygeneruj 3 alternatywne sformułowania, które mogą pomóc w znalezieniu odpowiednich informacji:
        1. Bardziej szczegółowa wersja z terminologią Warhammer Fantasy
        2. Szersza wersja obejmująca powiązane pojęcia
        3. Wersja skupiająca się na zasadach/mechanice, jeśli ma to zastosowanie
        
        Zwróć tylko 3 zapytania, po jednym w każdym wierszu.
        """

        response = self.client.chat.completions.create(
        extra_body={},
        model = 'llama-3.3-70b',
        max_completion_tokens=10240,
        temperature=0.7,
        top_p=1,
        messages=[
            {
            "role": "user",
            "content": prompt
            }
        ]
        )
        
        variations = response.choices[0].message.content.strip().split('\n')
        return [query] + [v.strip() for v in variations if v.strip()]
    
    def extract_entities(self, query: str) -> dict:
        """Extract Warhammer 40k entities from query"""
        warhammer_entities = {
            'factions': ['space marines', 'orks', 'eldar', 'tau', 'necrons', 'chaos'],
            'unit_types': ['infantry', 'vehicle', 'monster', 'character'],
            'weapons': ['bolter', 'lasgun', 'plasma', 'melta'],
            'rules': ['armor save', 'weapon skill', 'ballistic skill']
        }
        
        found_entities = {}
        query_lower = query.lower()
        
        for category, entities in warhammer_entities.items():
            found = [entity for entity in entities if entity in query_lower]
            if found:
                found_entities[category] = found
        
        return found_entities

### Context ranker

In [47]:
class ContextRanker:
    def __init__(self, model_name="radlab/polish-cross-encoder"):
        """Initialize cross-encoder for reranking"""
        self.reranker = CrossEncoder(model_name)
    
    def rerank_contexts(self, query: str, contexts: List[str], top_k: int = 5) -> List[dict]:
        """Rerank retrieved contexts using cross-encoder"""
        # Create query-context pairs
        pairs = [(query, context) for context in contexts]
        
        # Get relevance scores
        scores = self.reranker.predict(pairs)
        
        # Sort by score and return top_k
        scored_contexts = [
            {'context': context, 'score': score}
            for context, score in zip(contexts, scores)
        ]
        
        return sorted(scored_contexts, key=lambda x: x['score'], reverse=True)[:top_k]
    
    def filter_by_relevance(self, ranked_contexts: List[dict], threshold: float = 0.3) -> List[dict]:
        """Filter contexts by relevance threshold"""
        return [ctx for ctx in ranked_contexts if ctx['score'] > threshold]

## RAG

In [55]:
class WarhammerRAG:
    def __init__(self, retriever, ranker, llm_client):
        self.retriever = retriever
        self.ranker = ranker
        self.llm_client = llm_client
    
    def generate_response(self, query: str, max_context_length: int = 10240) -> dict:
        """Generate response using RAG pipeline"""
        
        # 1. Retrieve relevant contexts
        retrieved_contexts = self.retriever.retrieve(query, k=10)
        
        # 2. Rerank contexts
        context_texts = [ctx['document'] for ctx in retrieved_contexts]
        ranked_contexts = self.ranker.rerank_contexts(query, context_texts, top_k=5)
        
        # 3. Select contexts within token limit
        selected_contexts = self._select_contexts_by_length(ranked_contexts, max_context_length)
        
        # 4. Generate response
        response = self._generate_with_context(query, selected_contexts)
        
        return {
            'response': response,
            'sources': selected_contexts,
            'retrieved_count': len(retrieved_contexts)
        }
    
    def _select_contexts_by_length(self, contexts: List[dict], max_length: int) -> List[dict]:
        """Select contexts that fit within token limit"""
        selected = []
        current_length = 0
        
        for ctx in contexts:
            ctx_length = len(ctx['context'])  # Simplified length calculation
            if current_length + ctx_length <= max_length:
                selected.append(ctx)
                current_length += ctx_length
            else:
                break
        
        return selected
    
    def _generate_with_context(self, query: str, contexts: List[dict]) -> str:
        """Generate response using LLM with retrieved context"""
        
        context_text = "\n\n".join([f"[Source {i+1}]: {ctx['context']}" 
                                   for i, ctx in enumerate(contexts)])
        
        prompt = f"""
        Jesteś asystentem eksperta ds. Warhammera fantasy. Wykorzystaj podany kontekst, aby dokładnie odpowiedzieć na pytanie użytkownika.

        Kontekst z materiałów fantastyki Warhammer: {context_text}
        Pytanie użytkownika: {query}

        Instrukcje:
        - Opieraj swoją odpowiedź przede wszystkim na podanym kontekście
        - Jeśli kontekst nie zawiera wystarczających informacji, powiedz to jasno
        - Używaj odpowiedniej terminologii Warhammer Fantasy
        - W razie potrzeby uwzględnij odpowiednie zasady, statystyki lub wiedzę
        - Podaj źródła, do których się odwołujesz (np. „Według źródła 1...”)

        Odpowiedź:
        """
        
        response = self.llm_client.chat.completions.create(
            extra_body={},
            model = 'llama-3.3-70b',
            max_completion_tokens=10240,
            temperature=0.7,
            top_p=1,
        )
        
        return response.choices[0].message.content

## Pipeline

In [82]:
# define our querry
query = 'Podaj mi kilka interesujących pomysłów na stworzenie postaci uczonego studenta do sesji RPG w świecie Warhammer Fantasy.'
query = 'Czy w grze RPG warhammer fantasy 4e mogę korzystać z dwóch kusz jednocześnie?'

'''
# load document
doc_path = '/kaggle/input/warhammer-4e-rpg/WFRP_4_ed_PL_1.3.pdf'
wh_content = load_document(doc_path)
print("Document loaded - 1/9")

# split our document into chunks
wh_content_chunked = smart_overlap_pages(wh_content)
print("Document chunked - 2/9")

# initialize our embedding model and create emebeddings for our document
model = initialize_model("sdadas/mmlw-roberta-base")
wh_embedings = create_embedings(model, wh_content_chunked)
print("Embeddings prepared - 3/9")

# create our vector database and add our document into it
vector_store = VectorStore("warhammer_fantasy")
vector_store.add_documents(wh_content_chunked, wh_embedings)
print("Vector store ready - 4/9")
'''
# Initialize our hybrid retriever for better content picking
retriever = HybridRetriever(vector_store, model)
retriever.setup_bm25(wh_rulebook_overlap)
print("Retriever initialized - 5/9")

# generate additional context for our RAG
Enhancer = QueryEnhancer(api_key=cerebras_key)
query_enhanced = Enhancer.expand_warhammer_query(query)
print("Enhanced query generated - 6/9")

# 6. Setup reranker
ranker = ContextRanker()
print("Ranker initlialized - 7/9")

# 7. Setup LLM client
llm_client = Cerebras(api_key=cerebras_key)
print("LLM Loaded - 8/9")

# 8. Create RAG system
rag_system = WarhammerRAG(retriever, ranker, llm_client)
print("RAG ready - 9/9")

Retriever initialized - 5/9
Enhanced query generated - 6/9
Ranker initlialized - 7/9
LLM Loaded - 8/9
RAG ready - 9/9


In [67]:
query_enhanced

['Czy w grze RPG warhammer fantasy 4e mogę korzystać z dwóch kusz jednocześnie?',
 'Czy w Warhammer Fantasy 4e można używać dwóch kuszy jako podwójne bronie dystansowe?',
 'Czy istnieją ograniczenia dotyczące korzystania z wielu broni palnych w grach RPG?',
 'Jaki jest mechanizm obsługi broni podwójnych w Warhammer Fantasy 4e w kontekście kuszy?']

In [83]:
rag_system = WarhammerRAG(retriever, ranker, llm_client)
response = rag_system.generate_response(query_enhanced)
print(response)

KeyError: 0

In [None]:
# Split data into chunks

separators = [
r"\n.*?•\n", # New chapter
r"\n.*?\n",  # New topic
r"\n\n",     # Paragraphs
r"\n",       # Lines
r".",        # Sentences
]

#encoding = tiktoken.get_encoding("cl100k_base")
encoding = model

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
    #separators=separators,
    length_function=lambda x: len(encoding.encode("zapytanie: {x}", convert_to_tensor=True, show_progress_bar=False)),
    #is_separator_regex=True,
    #keep_separator=True
)

chunks = text_splitter.split_text(wh_rulebook_raw)

In [None]:
('page_content', 'WARHAMMER FANTASY ROLEPLAY\n6')

In [107]:
import re

pattern = r"\n.*?\n"
text = "Some text\nRandom content\nAnother section\nMore content\nFinal part."

matches = re.findall(pattern, text)
print(matches)  # Check if it correctly identifies separators


['\nRandom content\n', '\nMore content\n']


## Optimal parameters search

In [None]:
# TODO
# check optimal chunk size
# check optimal chunk overlap
# check best separators

In [15]:
len(chunks)

801

In [None]:
('page_content', '•\nA zatem, czego tu szukasz? Przygody?\nByć może. Złota? Zapewne. Sprawiedliwości?\nHa, to dość względne pojęcie! Cóż to? Świętoszkowaty błysk w twym oku? A, chcesz robić \nto, co jest właściwe... Dopóki jest to dobrze płatne, dostarcza Ci rozrywki i pasuje do Twoich \npoglądów. Niech będzie, to wystarczy. Nadasz się. Wejdź, opowiem Ci o tej robocie.\nGdy wymagany jest rezultat losowy, gra WFRP korzysta \nz dziesięciościennej kostki. Dziesięciościenne kostki zazwyczaj \nmają ścianki oznakowane od 0 do 9, gdzie rzut 0 liczy się jako \n10. W zasadach takie kości określane są jako k10, a ich liczba, \nktórą trzeba rzucić, zawsze jest podana w następujący sposób: \n1k10 za jedną kostkę, 2k10 za dwie kostki, 3k10 za trzy kostki \ni tak dalej.\nJeśli należy rzucić kilkoma kośćmi, wyniki są zawsze \nsumowane. Zatem jeśli zasady proszą o rzucenie 2k10, rzucasz \ndwoma dziesięciościennymi kośćmi i dodajesz wyniki ich obu, \nna przykład rzut 0 i 3 oznacza wynik 13 (10+3=13).\nCzasami rzut kostką zostanie zmodyfikowany przez dodanie \nlub odjęcie liczby. Zatem rzut 1k10+4 oznacza rzut jedną \ndziesięciościenną kostką i dodanie do wyniku 4, natomiast rzut \n2k10-3 wskazuje, że należy rzucić dwoma dziesięciościennymi \nkośćmi i odjąć od sumarycznego wyniku 3.\nPonadto zasady wykorzystują rzut dwoma dziesięciościennymi \nkośćmi do uzyskania liczby od 1 do 100 (oznakowane jako \n1k100). Aby to zrobić, jedna dziesięciościenna kostka zostaje \nuznana za kość „dziesiątek”, a druga za kość „jedności”. Teraz \nrzuć dwoma kośćmi i odczytaj wynik jako liczbę dwucyfrową. \nUwaga, w tym przypadku wynik „0” na kostce odczytujemy \nzawsze właśnie jako zero! Zatem rzut 1 na kostce dziesiątek i 4 \nna kostce jedności daje wynik 14, a rzut 4 i 2 oznacza 42. Jeśli \nna obu kościach wypadło 0, wynik wynosi 100.\nKOŚCI ZOSTAŁY RZUCONE\nTomasz Otto (Order #44833549)')