## 1. Data Loading

In [1]:
# Load the data from a PDF file and extract the text

from langchain.document_loaders import PyMuPDFLoader
# Load the PDF file
loader = PyMuPDFLoader("pdf_file\AI, Automation, and War The Rise of a Military-Tech Complex (Anthony King).pdf")
documents = loader.load()

In [2]:
# Remove pages that are mostly whitespace or very short
documents = [
    doc for doc in documents
    if len(doc.page_content.strip()) > 100  # adjustable threshold
]

In [3]:
documents = [doc for doc in documents if doc.metadata["page"] > 8]

In [4]:
# Clean the extracted pdf text

import re

def clean_text(text: str) -> str:
    text = text.replace('\x0c', '')                 # common page-break character
    text = re.sub(r'\s+\n', '\n', text)             # remove spaces before newlines
    text = re.sub(r'\n{2,}', '\n\n', text)          # collapse multiple newlines
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)      # remove weird unicode
    text = re.sub(r' +', ' ', text)                 # remove extra spaces
    return text.strip()


In [5]:
# Apply cleaning
for doc in documents:
    doc.page_content = clean_text(doc.page_content)

In [6]:
# Display a sample of the extracted text after cleaning
for i, doc in enumerate(documents[100:]):
    print(f"\n--- Page {doc.metadata['page']} ---")
    print(doc.page_content[250:300])


--- Page 110 ---
 powers of AI, many have overlooked this human col

--- Page 111 ---
to examine
how the armed forces are actually using

--- Page 112 ---
bout to automate command,
then. Nevertheless, prec

--- Page 113 ---
 
ments of the planning process.
Planning is not r

--- Page 114 ---
 Russian attack on Kyiv in February 2022, the mode

--- Page 115 ---

ment learning; programmers specified the outcome,

--- Page 116 ---
 potential: It was a state up/scale up ethos: move

--- Page 117 ---
uration
of the database itself; commercial intelli

--- Page 118 ---
Maps.
The system provided a commander with route c

--- Page 119 ---
he battlefield in real time and to communicate wit

--- Page 120 ---
make soldiers redundant; rather, it enables them t

--- Page 121 ---
 facili 
tates the fusion of data from all sensors

--- Page 122 ---
 large lan 
guage models or generative AI, like Ch

--- Page 123 ---
 to move between macro understandings of regional 

--- Page 124 ---
 purely statisti

In [7]:
import re

def detect_back_matter_start(documents, threshold: float = 0.8) -> int | None:
    """
    Detect the index in the document list where back matter begins (e.g., References, Bibliography, Index, etc.)

    Parameters:
    ----------
    documents : List[Document]
        The list of LangChain Document objects (e.g., from PyMuPDFLoader)
    threshold : float
        Percentage (default 0.85) of the book after which back matter is expected.

    Returns:
    -------
    int | None
        Index of the first back matter page, or None if not found.
    """

    back_keywords = ["bibliography", "references", "index", "appendix", "notes"]
    total_docs = len(documents)

    # Add sequential index metadata if missing
    for idx, doc in enumerate(documents):
        doc.metadata["index"] = idx

    # Only scan the last (1 - threshold)% of the book
    search_start = int(total_docs * threshold)

    for i in range(search_start, total_docs):
        doc = documents[i]
        text = doc.page_content.lower()

        # Extract all short lines to look for section titles
        lines = text.splitlines()
        short_lines = [line.strip() for line in lines if 3 <= len(line.strip()) <= 40]

        for line in short_lines:
            if re.match(r"^(bibliography|references|index|appendix|notes)\b", line):
                print(f"🟡 Back matter detected on page {doc.metadata.get('page', 'unknown')} at index {i}")
                print(f"➡️ Section header: {line}")
                return i

    # If nothing found
    print("✅ No back matter section found with current heuristic.")
    return None

In [8]:
# Detect start of back matter
back_start_index = detect_back_matter_start(documents)

# Split the documents
if back_start_index:
    main_docs = documents[:back_start_index]
    back_docs = documents[back_start_index:]
else:
    main_docs = documents
    back_docs = []

print(f"Main content: {len(main_docs)} pages | Back matter: {len(back_docs)} pages")

🟡 Back matter detected on page 197 at index 186
➡️ Section header: notes
Main content: 186 pages | Back matter: 44 pages


In [33]:
len(main_docs)

186

## 2. Data Chunking

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=250,
    separators=["\n\n", "\n" ",", ".", " "]
)
chunks = text_splitter.split_documents(main_docs)

In [37]:
len(chunks)

623

In [39]:
chunks[0].page_content

'Preface ix\nempirical and methodological obstacles. Sociology aspires to be generalis \ning and abstract; it aims to offer not just a narrative history of an event but a\ntheory of social practice. Nevertheless, the best empirical sociology is almost\nalways situated in a concrete, definable location; it studies a par tic u lar group of\n people, in a defined organisation, doing something specific, at an identifiable\ntime and place. In my work on the armed forces, I have certainly always tried\nto employ this method. I have studied rapid reaction forces, infantry platoons,\nand divisional commanders. These actors are all located at specific places; it is\npos si ble to engage with them directly. As a result, I have often visited infantry\nbattalions and divisional headquarters.\nThis AI proj ect has sometimes defied that method. The military develop \nment of AI is diffuse. It is not located in one military unit, or one headquarters,\nor even just a few places. Its application is div

## 3. Data Embeddings (Convert text to numerical vector space)


In [40]:
from sentence_transformers import SentenceTransformer

# Load the local embedding model (downloaded)
embedding = SentenceTransformer("BAAI/bge-base-en-v1.5")

In [41]:
# Apply the embedding to the chunks
hf_embedding = embedding.encode([chunk.page_content for chunk in chunks])

print(f"The lenght of the embeddings vector is {len(hf_embedding[0])}")
print(f"The embeddings object is an array of {len(hf_embedding)} X {len(hf_embedding[0])}")

The lenght of the embeddings vector is 768
The embeddings object is an array of 623 X 768


In [42]:
hf_embedding[0]


array([ 9.20394994e-03,  7.86512718e-03, -6.30810037e-02, -2.11902391e-02,
        7.12757092e-03,  1.49508901e-02,  4.14107032e-02,  1.79693699e-02,
       -2.08139569e-02, -3.40114050e-02, -1.84762143e-02, -1.13194780e-02,
       -1.48003139e-02, -1.98904648e-02, -3.20124179e-02,  8.60024840e-02,
        1.79261602e-02,  2.30967137e-03,  2.12303437e-02, -5.96225969e-02,
        5.12609724e-03,  2.57874951e-02, -5.71699627e-03,  5.75492308e-02,
        5.34069613e-02,  1.38144400e-02,  6.87570125e-02,  1.12561332e-02,
       -1.50141586e-02,  1.12626888e-03,  1.36094652e-02, -4.48277295e-02,
        1.14510453e-03, -2.44689286e-02,  6.22081058e-03, -4.28039543e-02,
       -2.16012038e-02, -6.22260990e-03,  4.46221828e-02, -4.13314737e-02,
       -3.55018191e-02,  2.42774351e-03, -1.88593008e-02, -9.82947927e-03,
       -4.84281369e-02,  2.86089047e-03, -4.76106107e-02,  3.86904553e-02,
       -5.09798117e-02,  1.29286433e-02, -7.80302361e-02,  1.21401949e-02,
        6.08840361e-02,  

## 4. Storage Embeddings (Vector DB)

In [44]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


index = faiss.IndexFlatIP(len(hf_embedding[0]))

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Create a vector store object
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

vector_store.add_documents(documents=chunks)

# Check the number of chunks that have been indexed
vector_store.index.ntotal

623

In [45]:
#  Save the vector store in persistent memory
vector_store.save_local(folder_path="./data",index_name="BDF_index")