## 1. Data Loading

In [1]:
# Load the data from a PDF file and extract the text

from langchain.document_loaders import PyMuPDFLoader
# Load the PDF file
loader = PyMuPDFLoader("pdf_file\AI, Automation, and War The Rise of a Military-Tech Complex (Anthony King).pdf")
documents = loader.load()

In [2]:
# Remove pages that are mostly whitespace or very short
documents = [
    doc for doc in documents
    if len(doc.page_content.strip()) > 100  # adjustable threshold
]

In [3]:
documents = [doc for doc in documents if doc.metadata["page"] > 8]

In [4]:
# Clean the extracted pdf text

import re

def clean_text(text: str) -> str:
    text = text.replace('\x0c', '')                 # common page-break character
    text = re.sub(r'\s+\n', '\n', text)             # remove spaces before newlines
    text = re.sub(r'\n{2,}', '\n\n', text)          # collapse multiple newlines
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)      # remove weird unicode
    text = re.sub(r' +', ' ', text)                 # remove extra spaces
    return text.strip()


In [5]:
# Apply cleaning
for doc in documents:
    doc.page_content = clean_text(doc.page_content)

In [6]:
# Display a sample of the extracted text after cleaning
for i, doc in enumerate(documents[100:]):
    print(f"\n--- Page {doc.metadata['page']} ---")
    print(doc.page_content[250:300])


--- Page 110 ---
 powers of AI, many have overlooked this human col

--- Page 111 ---
to examine
how the armed forces are actually using

--- Page 112 ---
bout to automate command,
then. Nevertheless, prec

--- Page 113 ---
 
ments of the planning process.
Planning is not r

--- Page 114 ---
 Russian attack on Kyiv in February 2022, the mode

--- Page 115 ---

ment learning; programmers specified the outcome,

--- Page 116 ---
 potential: It was a state up/scale up ethos: move

--- Page 117 ---
uration
of the database itself; commercial intelli

--- Page 118 ---
Maps.
The system provided a commander with route c

--- Page 119 ---
he battlefield in real time and to communicate wit

--- Page 120 ---
make soldiers redundant; rather, it enables them t

--- Page 121 ---
 facili 
tates the fusion of data from all sensors

--- Page 122 ---
 large lan 
guage models or generative AI, like Ch

--- Page 123 ---
 to move between macro understandings of regional 

--- Page 124 ---
 purely statisti

In [7]:
import re

def detect_back_matter_start(documents, threshold: float = 0.8) -> int | None:
    """
    Detect the index in the document list where back matter begins (e.g., References, Bibliography, Index, etc.)

    Parameters:
    ----------
    documents : List[Document]
        The list of LangChain Document objects (e.g., from PyMuPDFLoader)
    threshold : float
        Percentage (default 0.85) of the book after which back matter is expected.

    Returns:
    -------
    int | None
        Index of the first back matter page, or None if not found.
    """

    back_keywords = ["bibliography", "references", "index", "appendix", "notes"]
    total_docs = len(documents)

    # Add sequential index metadata if missing
    for idx, doc in enumerate(documents):
        doc.metadata["index"] = idx

    # Only scan the last (1 - threshold)% of the book
    search_start = int(total_docs * threshold)

    for i in range(search_start, total_docs):
        doc = documents[i]
        text = doc.page_content.lower()

        # Extract all short lines to look for section titles
        lines = text.splitlines()
        short_lines = [line.strip() for line in lines if 3 <= len(line.strip()) <= 40]

        for line in short_lines:
            if re.match(r"^(bibliography|references|index|appendix|notes)\b", line):
                print(f"🟡 Back matter detected on page {doc.metadata.get('page', 'unknown')} at index {i}")
                print(f"➡️ Section header: {line}")
                return i

    # If nothing found
    print("✅ No back matter section found with current heuristic.")
    return None

In [8]:
# Detect start of back matter
back_start_index = detect_back_matter_start(documents)

# Split the documents
if back_start_index:
    main_docs = documents[:back_start_index]
    back_docs = documents[back_start_index:]
else:
    main_docs = documents
    back_docs = []

print(f"Main content: {len(main_docs)} pages | Back matter: {len(back_docs)} pages")

🟡 Back matter detected on page 197 at index 186
➡️ Section header: notes
Main content: 186 pages | Back matter: 44 pages


In [9]:
# Print a sample of back docs
back_docs[0].page_content

'NOTES\n1. Robot Wars\n1. Ray Kurzweil, The Singularity is Near: When Humans Transcend Biology (London: Duck \nworth, 2005).\n2. Ray Kurzweil, The Singularity is Nearer: When We Merge with AI (Oxford: Bodley Head,\n2024).\n3. Kurzweil, The Singularity is Nearer, 10.\n4. James Lovelock, The Novacene: The Coming Age of Hyperintelligence (London: Penguin\nBooks, 2020), 111.\n5. Mustafa Suleyman with Michael Bhaskar, The Coming Wave: AI, Power and the Twenty- first\n Century s Greatest Dilemma (London: Bodley Head, 2023), 3.\n6. Melanie Mitchell, Artificial Intelligence: A Guide for Thinking Humans (London: Pelican,\n2019), 198 9.\n7. Suleyman, The Coming Wave, 53.\n8. Suleyman, The Coming Wave, 51.\n9. Suleyman, The Coming Wave, 53.\n10. Marcus du Sautoy, The Creativity Code: Art and Innovation in the Age of AI (Cambridge,\nMA: The Belknap Press, 2019), 31.\n11. Matthew Sparkes, DeepMind s Protein- Folding AI Cracks Biology s Biggest Prob lem ,\nNew Scientist, 28 July 2022, https:// www .

## 2. Data Chunking

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)
chunks = text_splitter.split_documents(main_docs)

In [11]:
type(chunks)

list

In [12]:
len(chunks)

880

In [13]:
main_docs[-1].page_content

'War at the Speed of Light 183\nwhich wants to wield military power has to embrace AI with which to enable\nand augment its armed forces. To do other wise would be like failing to adopt\ngunpowder, airpower, tanks, or aircraft or perhaps, even more aptly, failing\nto adopt mapping and charts.\nYet AI is not miraculous; it is not magic. AI offers novel capabilities, but its\npotential can be harnessed only through profound organisational reformation.\nA new relationship between the armed forces and the tech sector is required.\nConsequently, as armed forces pursue AI, a military- tech complex is\nappearing. In the next decade, the partnership between the state military\nforces and the private tech companies is likely to consolidate and deepen.\nUtopian or dystopian visions of war conducted by supercomputers and killer\ndrone swarms are phantasmagorical. Nevertheless, an emerging military- tech\ncomplex transforms the way in which states defend themselves and fight each\nother. The incre

In [14]:
back_docs[0].page_content

'NOTES\n1. Robot Wars\n1. Ray Kurzweil, The Singularity is Near: When Humans Transcend Biology (London: Duck \nworth, 2005).\n2. Ray Kurzweil, The Singularity is Nearer: When We Merge with AI (Oxford: Bodley Head,\n2024).\n3. Kurzweil, The Singularity is Nearer, 10.\n4. James Lovelock, The Novacene: The Coming Age of Hyperintelligence (London: Penguin\nBooks, 2020), 111.\n5. Mustafa Suleyman with Michael Bhaskar, The Coming Wave: AI, Power and the Twenty- first\n Century s Greatest Dilemma (London: Bodley Head, 2023), 3.\n6. Melanie Mitchell, Artificial Intelligence: A Guide for Thinking Humans (London: Pelican,\n2019), 198 9.\n7. Suleyman, The Coming Wave, 53.\n8. Suleyman, The Coming Wave, 51.\n9. Suleyman, The Coming Wave, 53.\n10. Marcus du Sautoy, The Creativity Code: Art and Innovation in the Age of AI (Cambridge,\nMA: The Belknap Press, 2019), 31.\n11. Matthew Sparkes, DeepMind s Protein- Folding AI Cracks Biology s Biggest Prob lem ,\nNew Scientist, 28 July 2022, https:// www .

## 3. Data Embeddings (Convert text to numerical vector space)


In [15]:
from sentence_transformers import SentenceTransformer

# Load the local embedding model (downloaded)
embeder = SentenceTransformer("all-MiniLM-L6-v2")

In [16]:
# Apply the embeder to the main_docs
hf_embeder = embeder.encode([doc.page_content for doc in main_docs])

print(f"The lenght of the embeddings vector is {len(hf_embeder[0])}")
print(f"The embeddings object is an array of {len(hf_embeder)} X {len(hf_embeder[0])}")

The lenght of the embeddings vector is 384
The embeddings object is an array of 186 X 384


## 4. Storage Embeddings (Vector DB)

In [18]:
hf_embeder[0]

array([-1.69724934e-02, -4.83500883e-02, -2.03245953e-02, -3.81211676e-02,
        1.81356929e-02,  4.71930578e-02, -1.19044986e-02,  1.17731774e-02,
        5.99678718e-02,  3.97858098e-02,  5.29544912e-02, -4.07153033e-02,
        6.03128299e-02,  1.64970681e-02,  3.99392061e-02, -5.91814891e-02,
       -9.50705202e-04, -1.24078542e-02,  1.78423449e-02, -1.87498815e-02,
       -5.99571094e-02, -8.76775905e-02,  2.88868025e-02,  7.22973980e-03,
       -2.48367377e-02, -4.14324179e-02,  3.56743708e-02, -3.27807330e-02,
       -3.17296274e-02, -1.97380912e-02,  7.87897781e-02,  1.19022772e-01,
        2.23127399e-02,  3.16709466e-02, -1.34714646e-02,  1.49367884e-01,
        3.15167643e-02,  6.26332089e-02,  7.19109103e-02,  6.33638352e-02,
       -9.64447856e-02, -3.35192531e-02,  4.96140532e-02, -7.06251785e-02,
        5.11290543e-02,  2.04872098e-02,  3.16320211e-02, -2.99573150e-02,
       -6.34517670e-02, -1.52521264e-02, -6.56698197e-02,  9.36113205e-03,
        5.11933863e-02,  

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

