## This notebook is an end to end build of your initial database 

Load your packages!

In [None]:
# standard library
import os
import json
import textwrap
import warnings
from typing import List, Optional, Tuple
from pprint import pprint
from collections import Counter
from datetime import datetime, date
import re
import time
from enum import Enum

# External Libraries
import pandas as pd
from tqdm import tqdm
from pydantic import BaseModel, PydanticDeprecatedSince20, Field
from huggingface_hub.utils import _deprecation
from concurrent.futures import ThreadPoolExecutor, as_completed
from numpy import dot
from numpy.linalg import norm

# Langchain specific
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.vectorstores import LanceDB
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# LanceDB specific
import lancedb
from lancedb.pydantic import LanceModel, Vector

In [None]:
# load functions

def Salty_Detective(root_path: str, extensions: List[str], export_csv: bool = False) -> List[Tuple[str, str, int]]:
    """
    Scans through all subfolders from root_path and identifies files
    matching user-defined extensions. Optionally exports to CSV.

    Parameters:
        root_path (str): The directory path to search.
        extensions (List[str]): File extensions to search for (e.g., ['.pdf', '.txt']).
        export_csv (bool): If True, saves results to a timestamped CSV and opens it.

    Returns:
        List of (filename, full_path, is_repeated)
    """
    extensions = [ext.lower() for ext in extensions]
    collected_files = []

    for dirpath, _, filenames in os.walk(root_path):
        for file in filenames:
            if any(file.lower().endswith(ext) for ext in extensions):
                full_path = os.path.join(dirpath, file)
                collected_files.append((file, full_path))

    name_counts = Counter([name for name, _ in collected_files])
    results = [(name, path, int(name_counts[name] > 1)) for name, path in collected_files]
    df = pd.DataFrame(results, columns=["File Name", "Path", "Does it Repeat (0 or 1)"])
    if export_csv:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"Salty_Detective_Results_{timestamp}.csv"
        df.to_csv(filename, index=False)
        print(f"\n CSV exported as: {filename}")

        try:
            os.startfile(filename)
        except Exception as e:
            print(f"There is an issue opening the file automatically: {e}")

    return df


def extract_metadata(text: str, file_path: str) -> dict:
    '''
    Function to scrape keywords from the chunks
    '''
    prompt = f"""
Given the following text, extract:

1. A list of 10–20 keywords (comma-separated)


Text:
{text}

Return in this format:
KEYWORDS: ...
"""
    response = llm.invoke([{"role": "user", "content": prompt}])
    lines = response.content.splitlines()

    keywords = []
    for line in lines:
        if line.startswith("KEYWORDS:"):
            keywords = [kw.strip() for kw in line[len("KEYWORDS:"):].split(",")]


    return {
        "key_words": keywords,
        "file_extension": file_path  # full path used as metadata
    }

def safe_embed_documents(rows, max_chars=3000):
    """
    Tries to embed all rows. If a 413 error occurs, truncates long rows and retries.
    """
    try:
        texts = [row["text"] for row in rows]
        return tei_endpoint.embed_documents(texts)

    except Exception as e:
        if "413" in str(e) or "Payload Too Large" in str(e):
            print("Payload too large. Truncating and retrying...")

            truncated_rows = []
            for row in rows:
                text = row["text"]
                if len(text) > max_chars:
                    row = row.copy()
                    row["text"] = text[:max_chars]
                truncated_rows.append(row)

            texts = [row["text"] for row in truncated_rows]
            return tei_endpoint.embed_documents(texts)

        else:
            raise  # re-raise unknown errors

def is_bad_ocr(text, symbol_threshold=0.3, short_line_ratio=0.5, gibberish_line_ratio=0.5):
    '''
    function to search for poor lines of text in a md format, adjustable threshold if OCR has a known type of failure to look for
    '''

    lines = text.splitlines()
    total_lines = len(lines)
    if total_lines == 0:
        return True

    # Count lines with mostly non-alphanumerics
    noisy_lines = sum(1 for line in lines if len(re.findall(r'[^\w\s]', line)) / (len(line) + 1e-5) > 0.5)

    # Count short lines
    short_lines = sum(1 for line in lines if len(line.strip()) <= 3)

    # Detect gibberish lines (e.g., repeated single characters)
    gibberish_lines = sum(1 for line in lines if re.match(r'^([^\w\s])\1{2,}$', line.strip()))

    # Check overall symbol frequency
    total_chars = len(text)
    symbol_count = len(re.findall(r'[^\w\s]', text))
    symbol_ratio = symbol_count / (total_chars + 1e-5)

    # Calculate line ratios
    short_line_ratio_actual = short_lines / total_lines
    gibberish_ratio_actual = gibberish_lines / total_lines

    # Determine bad OCR based on thresholds
    if (
        symbol_ratio > symbol_threshold or
        short_line_ratio_actual > short_line_ratio or
        gibberish_ratio_actual > gibberish_line_ratio
    ):
        return True

    return False

def clean_ocr_text(text):
    
    '''
    A function to clean poor OCR of excess symbols, white spacea and repeated character that don't make sense
    '''
    # Remove ASCII noise lines (e.g., lines full of ~, -, _, etc.)
    text = re.sub(r'^[~\-_\\\/\.\'\*\s]{3,}$', '', text, flags=re.MULTILINE)

    # Remove excessive whitespace
    text = re.sub(r'\s{2,}', ' ', text)
    
    # Normalize newlines (remove multiple blank lines)
    text = re.sub(r'\n{2,}', '\n', text)

    # Remove control characters and non-printable ASCII
    text = re.sub(r'[^\x20-\x7E\n]', '', text)

    # Optional: remove leading/trailing whitespace from lines
    text = "\n".join([line.strip() for line in text.splitlines()])

    return text.strip()

### Step 1: Get all your file extensions

In [3]:
# === Example Setup ===
#/home/dylan/oa4910_folder/
folder = r"/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quarter_7/OA4820_Capstone_Course/" #
#extensions_to_find = ['.pdf', '.txt', '.pptx', '.docx', '.xlsx', '.csv']  # all the formats we typically use
extensions_to_find = ['.pdf']  # test with three different types

# === Run Salty Detective ===
results = Salty_Detective(folder, extensions_to_find, export_csv=False)

# === Print Results ===
results

Unnamed: 0,File Name,Path,Does it Repeat (0 or 1)
0,PORTABLE_CASTLE_Quick_Look_Final_20180601.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
1,PRESCIENT VISION Quick Look FINAL.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
2,PROSAIC TWAIN_Quick Look_V4.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
3,VISTA QUAKE Quick Look Final.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
4,(U)_ACV11_Final_Report.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
...,...,...,...
163,Wargaming_Effectiveness_2006.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
164,Warriors_Edge.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
165,WG_6-13_CTOC_Report.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0
166,Wing_and_a_Prayer.pdf,/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quar...,0


In [4]:
paths=results.Path.to_list()

In [5]:
paths[0]

'/mnt/c/Users/kowal/OneDrive/Documents/NPS/Quarter_7/OA4820_Capstone_Course/CapStone_PDFs_MAJ_Fritzschreck/CapStone_PDFs_MAJ_Fritzschreck/PORTABLE_CASTLE_Quick_Look_Final_20180601.pdf'

### Step 2: OCR doucments

All OCR was done via linux on HPC (Hamming) at NPS

In [None]:
'''
#!/bin/bash
#SBATCH --job-name=docling_batch
#SBATCH --output=logs/docling_%A_%a.out
#SBATCH --error=logs/docling_%A_%a.err
#SBATCH --array=0-143        # Adjust this range to match number of PDFs
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=23:00:00
#SBATCH --mem=16G
 
## Activate your conda environment
source /smallwork/$USER/comp3/bin/activate
## conda activate DL
 
## Get the filename for this task
PDF=$(sed -n "$((SLURM_ARRAY_TASK_ID+1))p" pdf_list.txt)
 
 
## Run docling
docling --from pdf --to md --vlm-model smoldocling --image-export-mode referenced --output ./docling_output/ --verbose "$PDF"
'''

###  Step 3: Establish connection to your embedding model and generating LLMs

In [7]:
# Define the path to the model
endpoint_path = "gpu3"
info = os.popen(f"curl -s http://trac-malenia.ern.nps.edu:8080/{endpoint_path}/info").read()
repo_id = "/".join(json.loads(info)['model_id'].split('/')[-2:])

print(f"Connected to Genrating LLM: {repo_id}")

# Create the wrapper for the endpoint
tgi_endpoint = HuggingFaceEndpoint(
    endpoint_url=f"http://trac-malenia.ern.nps.edu:8080/{endpoint_path}",
    max_new_tokens=256,
    temperature=0.0,
    do_sample=False
)

# Patch to avoid HuggingFace deprecation warnings
class LocalChatHuggingFace(ChatHuggingFace):
    def _resolve_model_id(self):
        pass

llm = LocalChatHuggingFace(
    llm=tgi_endpoint,
    model_id=repo_id,
    verbose=True
)

# check model's max context window
print(f'Model max context window: {llm.tokenizer.model_max_length}')


# Define the path to the small model
endpoint_path2 = "gpu1"
info = os.popen(f"curl -s http://trac-malenia.ern.nps.edu:8080/{endpoint_path2}/info").read()
repo_id2 = "/".join(json.loads(info)['model_id'].split('/')[-2:])

print(f"Connected to small LLM: {repo_id2}")

# Create the wrapper for the endpoint
tgi_endpoint2 = HuggingFaceEndpoint(
    endpoint_url=f"http://trac-malenia.ern.nps.edu:8080/{endpoint_path2}",
    max_new_tokens=256,
    temperature=0.0,
    do_sample=False
)

# Patch to avoid HuggingFace deprecation warnings
class LocalChatHuggingFace(ChatHuggingFace):
    def _resolve_model_id(self):
        pass

llm2 = LocalChatHuggingFace(
    llm=tgi_endpoint2,
    model_id=repo_id2,
    verbose=True
)

# check model's max context window
print(f'Model max context window: {llm2.tokenizer.model_max_length}')


endpoint_path3 = "gpu4"
info = os.popen(f"curl -s http://trac-malenia.ern.nps.edu:8080/{endpoint_path3}/info").read()
repo_id3 = "/".join(json.loads(info)['model_id'].split('/')[-2:])

print(f"Connected to embedding LLM: {repo_id3}")
# Use same endpoint for embeddings
tei_endpoint = HuggingFaceEndpointEmbeddings(
    model=f"http://trac-malenia.ern.nps.edu:8080/{endpoint_path3}/embed"  # Adjust path as needed
)

Connected to Genrating LLM: casperhansen/llama-3.3-70b-instruct-awq
Model max context window: 131072
Connected to small LLM: Qwen/Qwen2.5-7B-Instruct
Model max context window: 131072
Connected to embedding LLM: nomic-ai/nomic-embed-text-v1.5


### Step 4 Create Database Schema and establish metadata fields

In [8]:
class ChunkMetadata(BaseModel):
    file_extension: Optional[str] = ""
    title: Optional[str] = ""
    date_of_pub: Optional[date] = None
    domain: Optional[List[str]] = []
    agency: Optional[List[str]] = []
    cocom: Optional[List[str]] = []
    country: Optional[List[str]] = []
    category: Optional[str] = ""
    purpose: Optional[str] = ""
    key_words: Optional[List[str]] = []




class TextChunkSchema(LanceModel):
    id: str
    vector: Vector(768)
    metadata: ChunkMetadata = ChunkMetadata()
    text: str

# initialize a local LanceDB database
db = lancedb.connect("./lancedb")

# initialize an empty table in the database
table = db.create_table("Capstone_test_table",
                        schema=TextChunkSchema,
                        mode='overwrite', ### Only do overwrite if you want to wipe the table clean
                        #partitioned_by=["year_of_exercise", "agency_supporting", "combatant_command", "countries", "key_words"],
                        #mode='append',  # Append to the table if it already exists
                        #partitioned_by=["year_of_exercise", "agency_supporting", "combatant_command", "countries", "key_words"],
                        )

In [9]:

class Domain(str, Enum):
    Land = "Land"
    Maritime = "Maritime"
    Air = "Air"
    Space ="Space"
    Cyberspace= "Cyberspace"
    Other= "Other" #create an out for the model if no domain match well!
class COCOM(str, Enum):
    INDOPACOM = "INDOPACOM"
    PACOM = "PACOM"
    CENTCOM = "CENTCOM"
    EUCOM= "EUCOM"
    AFRICOM = "AFRICOM"
    NORTHCOM ="NORTHCOM"
    SOUTHCOM = "SOUTHCOM"
    SPACECOM =  "SPACECOM"
    CYBERCOM = "CYBERCOM"
    SOCOM = "SOCOM"
    STRATCOM = "STRATCOM"
    TRANSCOM = "TRANSCOM"
    Other= "Other" #create an out for the model if no domain match well!
class Wargame_type(str, Enum):
    policy_decision = "policy"
    capbility_requirement= 'capability'
    conop_strategy = 'conop'
    process_org = "process"
    other= "other"
class SearchSchema(BaseModel, use_enum_values=True):
    title: str = Field(description="the title of the wargame paper")
    date_of_pub: date = Field(description="the date the paper was written, if only a year is found, use January 1st. If only a month and year is found, use the 1st of the month.",
                             example=["1999-12-01","2001-01-30","2010-04-15"])
    agency: List[str] = Field(description="the list of agencies that either sponsored or hosted the wargame")
    domain: List[Domain] = Field(description="the list of domains covered in the wargame, these are limited to Land, Maritime, Air, Space and Cyberspace. An other category is left for any wargame that does" \
    "not fit one of the previous listed domains." \
    " Land: This domain encompasses the surface of the Earth, excluding the high-water mark, and includes land-based operations." \
    " Maritime: This domain includes oceans, seas, bays, estuaries, islands, and coastal areas, along with the airspace above these areas." \
    " Air: This domain refers to the atmosphere, extending from the Earth's surface to the altitude where its effects on operations become negligible." \
    " Space: This domain includes the area above the atmosphere where the effects of atmospheric drag on airborne objects become negligible." \
    " Cyberspace: This domain encompasses the interdependent networks of information technology infrastructures, the Internet, and other data, including computer systems and embedded processors." \
    " Other: This domain is only to be selected if all other domains do not match well to the wargame.")
    country: List[str] = Field(description="This is a list of participating countries or countries that are part of the wargame")
    cocom: List[COCOM] = Field (description="the list of combatant commands associated with the wargame," \
    " these are limited to AFRICOM, CENTCOM, CYBERCOM, EUCOM, INDOPACOM, NORTHCOM, SPACECOM, SOUTHCOM, STRATCOM, and TRANSCOM. An other category is left for any wargame that does" \
    "not fit one of the previous listed combatant commands." \
    "AFRICOM includes all countries in Africa accept Egypt." \
    "CENTCOM is includes Egypt, Yemen, Oman, United Arab Emirates, Qatar, Bahrain, Saudi Arabia, " \
    "Jordan, Israel, Lebanon, Syria, Iraq, Kuwait, Iran, Afganistan, Pakistan, Turkmenistan, Tajikistan, Uzbekistan, Kyrgyzstan and Kazakhstan." \
    "CYBRECOM defends the United States, countering hostile cyber actors alongside our interagency, industry, and international partners. Thic COCOM is focused on cyberspace." \
    "EUCOM is responsible for military operations in Europe and parts of Eurasia. EUCOM includes the following countries: Albania, Andorra, Austria, Belgium, Bosnia and Herzegovina, Bulgaria" \
    "Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Kosovo, Latvia, Liechtenstein, " \
    "Lithuania, Luxembourg, Malta, Moldova, Monaco, Montenegro, Netherlands, North Macedonia, Norway, Poland, Portugal, Romania, San Marino, Serbia, Slovakia, Slovenia, " \
    "Spain, Sweden, Switzerland, Ukraine, United Kingdom, Vatican City. EUCOM also includes the Artic" \
    "INDOPACOM includes 38 countries: China, Japan, Taiwan, South Korea (Republic of Korea), Norht Korea (Democratic People's Republic of Korea), Mongolia, " \
    "Brunei, Cambodia, Indonesia, Laos, Myanmar (Burma), Malaysia, Philippines, Singapore, Thailand, Timor-Leste, Vietnam, Bangladesh, Bhutan,India, Nepal, Guam," \
    "Maldives, Sri Lanka, Australia, New Zealand, Papua New Guinea, Solomon Islands, Nauru, Vanuatu, Fiji, Tonga, Tuvalu, Kiribati, Marshall Islands, Samoa, American Samoa." \
    "NORTHCOM includes Canada, Mexico, Bermuda, Bahamas and the United States." \
    "SOUTHCOM includes Antigua, barbuda, Argentina, Barbados, Belize, Bolivia, Brazil,CHile,Colombia, Costa Rica, Cuba," \
    "Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, " \
    "Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saunt Lucia, Saint Vincent and Grenadines, Suriname, Trinidad and Tobago, Uruguay, " \
    "Venezuela." \
    "SPACECOM conducts operations in, from and to space to deter conflict and if necessary, defeat aggression, deliver space combat power." \
    "SOCOM develops, and employs, the world's finest SOF to conduct global special operations and activities as part of the Joint Force, in concert with" \
    " the U.S. Government Interagency, Allies, and Partners, to support persistent, networked, and distributed combatant command operations and campaigns" \
    " against state and non-state actors all to protect and advance U.S. policies and objectives." \
    "STRATCOM is responsible for Strategic Deterrence, Nuclear Operations, Nuclear Command, Control, and Communications (NC3) Enterprise Operations," \
    " Joint Electromagnetic Spectrum Operations, Global Strike and Missile Threat Assessment." \
    "TRANSCOM provides transportation services and capabilities to the other combatant commands, the military services, and defense agencies." \
    " Other: This combatant command is only to be selected if all other combatant commands do not match well to the wargame." \
    "PACOM should be written as INDOPACOM"
    )
    category: Wargame_type = Field(description= "The wargame type limited to policy, capability, conop, process or other." \
    "Policy wargames explore the implications of policy decisions in complex and uncertain environments. These games involve " \
    "participants—often experts, officials, or stakeholders—who role-play different actors to test strategies, forecast outcomes,"
    " and identify potential risks and unintended consequences." \
    "Capability wargames are structured simulations used to identify, test, and refine the capabilities that an organization—typically" \
    " a military, government agency, or large institution—needs to effectively operate in future scenarios. These wargames focus less" \
    " on specific policy decisions and more on the tools, technologies, personnel, infrastructure, and systems required to achieve known strategic objectives." \
    "Conop wargames are defined as scenario-driven event designed to explore and validate how an organization will employ its forces, capabilities, or resources to " \
    "accomplish strategic objectives under realistic and often adversarial conditions. Conop wargames test operational concepts in simulated real-world or " \
    "future environments." \
    "Process wargames are secanrio-driven exercises designed to test, evaluate, and improve the internal processes, workflows, coordination mechanisms, and " \
    "organizational structures of an entity—such as a military unit or government agency under realistic or stressful conditions." \
    "If no wargame type fits the policy, capabilty, conop, or process wargame definitions, choose other.")
    purpose: str = Field(description="Brief summary of the purpose of the wargame, no more than a paragraph long")


class SearchSchema2(BaseModel):
    keywords: List[str] = Field(description="This is a list of 10-20 keywords in the chunk.")
    country: List[str] = Field(description="This is a list of countries in the chunk. This can be an empyt list if no country is mentioned in the chunk.")
    

In [None]:

pydantic_parser = PydanticOutputParser(pydantic_object=SearchSchema)
format_instructions = pydantic_parser.get_format_instructions()

# The Pydantic model creates the formatting instructions to be included in the prompt
# Here is the what those instructions look like
print(format_instructions)


pydantic_parser2 = PydanticOutputParser(pydantic_object=SearchSchema2)
format_instructions2 = pydantic_parser2.get_format_instructions()

# The Pydantic model creates the formatting instructions to be included in the prompt
# Here is the what those instructions look like
print(format_instructions2)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"COCOM": {"enum": ["INDOPACOM", "PACOM", "CENTCOM", "EUCOM", "AFRICOM", "NORTHCOM", "SOUTHCOM", "SPACECOM", "CYBERCOM", "SOCOM", "STRATCOM", "TRANSCOM", "Other"], "title": "COCOM", "type": "string"}, "Domain": {"enum": ["Land", "Maritime", "Air", "Space", "Cyberspace", "Other"], "title": "Domain", "type": "string"}, "Wargame_type": {"enum": ["policy", "capability", "conop", "process", "other"], "title": "Wargame_type", "type": "string"}}, "properties": {"title": {"description": "the title of the wargame paper", "title": "Title", "ty

In [11]:
API_prompt = PromptTemplate(
        template="""
        You are an expert wargame reviewer. Analyze the following wargame report in detail.
        Break down each major metadata parameter from the format instructions, provide a brief summary.
        
        Wargame Report: {wargame_report}
        
        {format_instructions}
        
        Analyze the wargame following the exact format specified above.
        """,
        input_variables=["wargame_report"],
        partial_variables={"format_instructions": pydantic_parser.get_format_instructions()}
    )
    
chain = API_prompt | llm | pydantic_parser

### Step 4: Iterate through every document and store in Database

In [15]:
text_splitter = CharacterTextSplitter(
    separator="\n",          # Split along line breaks
    chunk_size=1500,          # Up to 500 characters per chunk
    chunk_overlap=200        # Optional: adds continuity between chunks
)

In [19]:
# get list of paths to md files in specified directory
directory = "./final_docling_output"
doc_paths = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith('.md')]

## MArcus used unstructure loader...why?
docs = [TextLoader(doc_path).load() for doc_path in doc_paths]
docs_flat_full = [item for sublist in docs for item in sublist]

In [None]:
docs_flat=docs_flat_full #allows you to test the code without running the whole dang thing
doc_batch_size=4
full_doc_metadata={}
error_log = {} 
failed_chunks = []
# Suppress deprecation warnings
warnings.filterwarnings("ignore", category=PydanticDeprecatedSince20)
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
for j in range(0, len(docs_flat), doc_batch_size):
    remove_doc=[]
    cutoff=j+doc_batch_size-1
    if cutoff>len(docs_flat):
        cutoff=len(docs_flat)-1
    print(f'Working documents: {j} to {cutoff}')
    current_docs=docs_flat[j:j+doc_batch_size]
    full_doc_metadata={}
    for doc in reversed(current_docs): 
        print(f'Starting on {doc.metadata['source']} document')
        try:
            json_output=chain.invoke({"wargame_report":doc.page_content})  
            full_doc_metadata[doc.metadata['source']]=json_output.model_dump()
        except Exception as e:
            if "422" in str(e):
                print(f"too many tokens, attempting to shorten document with {len(doc.page_content)} characters")
                time.sleep(5)
                try:
                    if is_bad_ocr(doc.page_content):
                        print("detected bad ocr, cleaning and retrying API call")
                        text=clean_ocr_text(doc.page_content)
                        text=doc.page_content[:150000] +"..."                        
                        json_output=chain.invoke({"wargame_report":text})  
                        full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                    else:
                        print("good ocr, document is just too long, trimming end") 
                        text=clean_ocr_text(doc.page_content)
                        text=doc.page_content[:150000] +"..."
                        json_output=chain.invoke({"wargame_report":text})  
                        full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                except Exception as f:
                    print(f'error at {doc.metadata['source']}')
                    print(f'error code: {f}')
                    error_log[doc.metadata['source']]= str(f)
                    current_docs.remove(doc)
            elif "Read timed out." in str(e):
                print("time out error, pausing for 10 seconds")
                time.sleep(10)
                try:
                    json_output=chain.invoke({"wargame_report":doc.page_content})  
                    full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                except Exception as f:
                    print("time out error not fixed")
                    print(f'error at {doc.metadata['source']}')
                    print(f'error code: {f}')
                    error_log[doc.metadata['source']]= str(f)
                    current_docs.remove(doc)          
           # if str(e)
            else:
                print(f'error at {doc.metadata['source']}')
                print(f'error code: {e}')
                error_log[doc.metadata['source']]= str(e)
                current_docs.remove(doc)

    # Create IDs for each chunk so we're not inserting duplicates
    existing_ids = set()
    print()
    try:
        existing_ids = set(row["id"] for row in table.to_arrow().to_pylist())
        print(f"Loaded {len(existing_ids)} existing IDs from LanceDB.")
    except Exception as e:
        print(f"Warning: Could not load existing IDs. Starting fresh. Reason: {e}")

    # Initialize counters
    skipped = 0
    processed = 0


    # Batch size for LLM + TEI
    batch_size = 32
    updates = []
    texts_to_embed = []
    rows_to_embed = []

    #remove docs where API call failed

    # Apply splitter to your loaded docs
    chunked_docs = []
    for doc in current_docs:
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            chunked_docs.append({
                "text": chunk,
                "file_path": doc.metadata.get("source", ""),
                "chunk_index": i
            })

    print(f"Created {len(chunked_docs)} text chunks.")
    # Processing loop
    for i in tqdm(range(0, len(chunked_docs), batch_size), desc="Processing chunks"):
        batch = chunked_docs[i:i + batch_size]

        # Filter out already-inserted chunks
        filtered_batch = []
        for chunk in batch:
            row_id = f"{chunk['file_path']}#{chunk['chunk_index']}"
            if row_id not in existing_ids:
                filtered_batch.append((row_id, chunk))
            else:
                skipped += 1

        if not filtered_batch:
            continue

        # Parallelize metadata extraction
        # Create worker threads
        with ThreadPoolExecutor(max_workers=5) as executor:
            # Submit tasks to the executor
            futures = {
                executor.submit(extract_metadata, chunk["text"], chunk["file_path"]): (row_id, chunk)
                for row_id, chunk in filtered_batch
            }
            # Process results as they complete
            for future in as_completed(futures):
                try:
                    metadata = future.result()
                    row_id, chunk = futures[future]
                    # Add the remaining schema-required fields
                    # ** unpack the metadata dict
                    row_metadata = ChunkMetadata(
                        **{
                            **metadata,
                            **full_doc_metadata[chunk["file_path"]]
                        }
                    ).model_dump()
                    # Package into full LanceDB-compatible row
                    rows_to_embed.append({
                        "id": row_id,
                        "text": chunk["text"],
                        "metadata": row_metadata
                    })
                    texts_to_embed.append(chunk["text"])
                    processed += 1

                except Exception as e:
                    file_path = futures[future][1]['file_path']
                    print(f"Metadata extraction failed for {file_path}: {e}")
                    failed_chunks.append(file_path)

        # Run TEI Embedding
        if texts_to_embed:
            # Embed the texts in batches
            vectors = safe_embed_documents(rows_to_embed)

            # Add vectors to the rows
            # zip the vectors with the rows of text
            for row, vector in zip(rows_to_embed, vectors):
                updates.append({
                    "id": row["id"],
                    "vector": vector,
                    "text": row["text"],
                    "metadata": row["metadata"]
                })

            existing_ids.update(row["id"] for row in rows_to_embed)
            rows_to_embed = []
            texts_to_embed = []

        # Add LanceDB updates in batches
        # Check if the updates list has reached the batch size
        if len(updates) >= 256:
            table.add(updates)
            updates = []

    # Only for the last batch which may not be full
    if texts_to_embed:
        vectors = safe_embed_documents(rows_to_embed)

        for row, vector in zip(rows_to_embed, vectors):
            updates.append({
                "id": row["id"],
                "vector": vector,
                "text": row["text"],
                "metadata": row["metadata"]
            })
        existing_ids.update(row["id"] for row in rows_to_embed)
    # final catch for updates
    if updates:
        table.add(updates)

    # Final Report
    print("\nFinished processing.")
    print(f"Chunks inserted      : {processed}")
    print(f"Chunks skipped       : {skipped}")
    print(f"Chunks failed (LLM)  : {len(failed_chunks)}")
    print(f"Total chunked_docs   : {len(chunked_docs)}")

    if failed_chunks:
        print("Failed chunks:")
        for path in failed_chunks[:5]:
            print(f" - {path}")
        if len(failed_chunks) > 5:
            print(f" ...and {len(failed_chunks) - 5} more.")

    print("\n \n")     
 

Working documents: 0 to 3
Starting on ./final_docling_output/(U)_DON_Climate_Action_2030_TTX_III_Final_Report.md document
Starting on ./final_docling_output/(U)_China_Future_report_AWC_15_May_15.md document
Starting on ./final_docling_output/(U)_ACV11_QuickLook_Final.md document
Starting on ./final_docling_output/(U)_ACV11_Final_Report.md document

Loaded 0 existing IDs from LanceDB.
Created 170 text chunks.


Processing chunks: 100%|██████████| 6/6 [02:39<00:00, 26.54s/it]



Finished processing.
Chunks inserted      : 170
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 170

 

Working documents: 4 to 7
Starting on ./final_docling_output/(U)_EW09_Quicklook.md document
Starting on ./final_docling_output/(U)_EW09_Final_Report.md document
Starting on ./final_docling_output/(U)_EW08_Quicklook.md document
Starting on ./final_docling_output/(U)_EW08_Final_Report.md document

Loaded 170 existing IDs from LanceDB.
Created 174 text chunks.


Processing chunks: 100%|██████████| 6/6 [02:44<00:00, 27.46s/it]



Finished processing.
Chunks inserted      : 174
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 174

 

Working documents: 8 to 11
Starting on ./final_docling_output/(U)_Syria_Analysis_Game_Final_Report.md document
Starting on ./final_docling_output/(U)_Expeditionary_Medicine_2015_Final_Report.md document
Starting on ./final_docling_output/(U)_EW11_Final_Report.md document
Starting on ./final_docling_output/(U)_EW10_Final_Report.md document


Created a chunk of size 1507, which is longer than the specified 1500
Created a chunk of size 1563, which is longer than the specified 1500
Created a chunk of size 1673, which is longer than the specified 1500
Created a chunk of size 1584, which is longer than the specified 1500



Loaded 344 existing IDs from LanceDB.
Created 337 text chunks.


Processing chunks: 100%|██████████| 11/11 [05:07<00:00, 27.94s/it]



Finished processing.
Chunks inserted      : 337
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 337

 

Working documents: 12 to 15
Starting on ./final_docling_output/ADA542668.md document
Starting on ./final_docling_output/2013_Naval_Services_Game_Report.md document
Starting on ./final_docling_output/2010_MAD_Game_Part1.md document
Starting on ./final_docling_output/1994_Russian-UK-US.md document


Created a chunk of size 1725, which is longer than the specified 1500
Created a chunk of size 1672, which is longer than the specified 1500
Created a chunk of size 2178, which is longer than the specified 1500
Created a chunk of size 1736, which is longer than the specified 1500
Created a chunk of size 1616, which is longer than the specified 1500
Created a chunk of size 1867, which is longer than the specified 1500
Created a chunk of size 2174, which is longer than the specified 1500



Loaded 681 existing IDs from LanceDB.
Created 438 text chunks.


Processing chunks: 100%|██████████| 14/14 [06:32<00:00, 28.02s/it]



Finished processing.
Chunks inserted      : 438
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 438

 

Working documents: 16 to 19
Starting on ./final_docling_output/Algernon_Wargame.md document
Starting on ./final_docling_output/After_Action_Report_JCRX_FLINTLOCK_86.md document
Starting on ./final_docling_output/Adding_Weather_to_Wargames.md document
Starting on ./final_docling_output/Addendum_to_ARL-TR-4005.md document



Created a chunk of size 1660, which is longer than the specified 1500
Created a chunk of size 1660, which is longer than the specified 1500
Created a chunk of size 1729, which is longer than the specified 1500
Created a chunk of size 1746, which is longer than the specified 1500
Created a chunk of size 1522, which is longer than the specified 1500
Created a chunk of size 1514, which is longer than the specified 1500


Loaded 1119 existing IDs from LanceDB.
Created 292 text chunks.


Processing chunks: 100%|██████████| 10/10 [05:14<00:00, 31.44s/it]



Finished processing.
Chunks inserted      : 292
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 292

 

Working documents: 20 to 23
Starting on ./final_docling_output/ARDE_WarGame_1960.md document
Starting on ./final_docling_output/Architecture_Tradeoff_Analysis_2001.md document
Starting on ./final_docling_output/All_Hazards_Plan_Validation_Table_Top_Exercise-ARA.md document
Starting on ./final_docling_output/All_Hazards_After_Action_Report.md document



Created a chunk of size 1535, which is longer than the specified 1500


Loaded 1411 existing IDs from LanceDB.
Created 105 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:41<00:00, 25.50s/it]



Finished processing.
Chunks inserted      : 105
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 105

 

Working documents: 24 to 27
Starting on ./final_docling_output/Battle_Simulations.md document
Starting on ./final_docling_output/Battle_of_Jutland_1922_2020.md document
Starting on ./final_docling_output/A_Report_by_the_Military_Committee_on_NATO_EXERCISES_1959.md document
Starting on ./final_docling_output/Auger_1997.md document



Created a chunk of size 1682, which is longer than the specified 1500
Created a chunk of size 1522, which is longer than the specified 1500
Created a chunk of size 1618, which is longer than the specified 1500


Loaded 1516 existing IDs from LanceDB.
Created 147 text chunks.


Processing chunks: 100%|██████████| 5/5 [02:15<00:00, 27.00s/it]



Finished processing.
Chunks inserted      : 147
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 147

 

Working documents: 28 to 31
Starting on ./final_docling_output/CHENNAULT_EVENT.md document
Starting on ./final_docling_output/Caspian_Sea_Game_1998.md document
Starting on ./final_docling_output/CARMAX_83A_Research_Project.md document
too many tokens, attempting to shorten document with 576622 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/CAMMS_v_CPX.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/CAMMS_v_CPX.md
error code: Failed to parse SearchSchema from completion {"title": "EVALUATION OF A COMPUTER-ASSISTED BATTLE SIMULATION: CAMMS VERSUS A CPX", "date_of_pub": "1979-04-01", "agency": ["U.S. Army Research Institute for the Behavioral and Social Sciences"], "domain": ["Land"], "country": ["United States"], "cocom": ["TRADOC"], "category": "capabi

Created a chunk of size 1888, which is longer than the specified 1500
Created a chunk of size 1750, which is longer than the specified 1500
Created a chunk of size 1544, which is longer than the specified 1500
Created a chunk of size 1517, which is longer than the specified 1500
Created a chunk of size 1667, which is longer than the specified 1500
Created a chunk of size 1701, which is longer than the specified 1500
Created a chunk of size 1744, which is longer than the specified 1500
Created a chunk of size 1804, which is longer than the specified 1500
Created a chunk of size 2220, which is longer than the specified 1500
Created a chunk of size 1532, which is longer than the specified 1500
Created a chunk of size 1772, which is longer than the specified 1500


Loaded 1663 existing IDs from LanceDB.
Created 506 text chunks.


Processing chunks: 100%|██████████| 16/16 [10:09<00:00, 38.09s/it]



Finished processing.
Chunks inserted      : 506
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 506

 

Working documents: 32 to 35
Starting on ./final_docling_output/Counter-Insurgency_Study.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Counter-Insurgency_Study.md
error code: Failed to parse SearchSchema from completion {"title": "AGILE-COIN GAME", "date_of_pub": "1965-11-01", "agency": ["ARPA"], "domain": ["Land"], "country": ["United States"], "cocom": ["INDOPACOM"], "category": ["capability"], "purpose": "The purpose of this wargame is to explore the feasibility of computer models based on game findings that imitate some of the major aspects of the terror-phase of internal revolutionary conflict."}. Got: 1 validation error for SearchSchema
category
  Input should be 'policy', 'capability', 'conop', 'process' or 'other' [type=enum, input_value=['capability'], input_type=list]
    For further info

Processing chunks: 100%|██████████| 2/2 [00:40<00:00, 20.39s/it]



Finished processing.
Chunks inserted      : 39
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 39

 

Working documents: 36 to 39
Starting on ./final_docling_output/DSB_report_2021.md document
Starting on ./final_docling_output/DoD_Activities_1989.md document
Starting on ./final_docling_output/document.md document
Starting on ./final_docling_output/Defend_Forward_Game_Report_2019.md document



Created a chunk of size 2223, which is longer than the specified 1500
Created a chunk of size 1503, which is longer than the specified 1500
Created a chunk of size 1611, which is longer than the specified 1500
Created a chunk of size 1743, which is longer than the specified 1500
Created a chunk of size 1588, which is longer than the specified 1500
Created a chunk of size 1573, which is longer than the specified 1500
Created a chunk of size 2488, which is longer than the specified 1500
Created a chunk of size 2418, which is longer than the specified 1500
Created a chunk of size 1875, which is longer than the specified 1500
Created a chunk of size 1875, which is longer than the specified 1500
Created a chunk of size 1875, which is longer than the specified 1500
Created a chunk of size 1875, which is longer than the specified 1500
Created a chunk of size 1875, which is longer than the specified 1500
Created a chunk of size 1728, which is longer than the specified 1500
Created a chunk of s

Loaded 2208 existing IDs from LanceDB.
Created 303 text chunks.


Processing chunks: 100%|██████████| 10/10 [04:37<00:00, 27.78s/it]



Finished processing.
Chunks inserted      : 303
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 303

 

Working documents: 40 to 43
Starting on ./final_docling_output/Exhibit_R-2_Iron_Crucible.md document
Starting on ./final_docling_output/Exercise_Cygnus_Report.md document
Starting on ./final_docling_output/ExerciseViking22_Final_Report.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/ExerciseViking22_Final_Report.md
error code: Failed to parse SearchSchema from completion {"Title": "VIKING 22", "DateOfPub": "2022-03-20", "Agency": ["NATO", "UN"], "Domain": ["Land", "Maritime", "Air", "Space", "Cyberspace"], "Country": ["Sweden", "Finland", "Brazil", "Southland"], "Cocom": ["EUCOM", "NORTHCOM", "INDOPACOM", "CENTCOM"], "Category": "conop", "Purpose": "Exercise Viking is an important exercise and a great vantage between Military, Civilian and police. This exercise will help to expand the horizon of kno

Created a chunk of size 1688, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of size 8149, which is longer than the specified 1500
Created a chunk of s

Loaded 2511 existing IDs from LanceDB.
Created 309 text chunks.


Processing chunks: 100%|██████████| 10/10 [04:12<00:00, 25.24s/it]



Finished processing.
Chunks inserted      : 309
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 309

 

Working documents: 44 to 47
Starting on ./final_docling_output/FMD_TTX_ESF11_Report.md document
Starting on ./final_docling_output/Fleet_Arctic_Operations_Game.md document
Starting on ./final_docling_output/Final_Production_2001_2019.md document
too many tokens, attempting to shorten document with 621940 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/Falklands_Wargame.md document



Created a chunk of size 2656, which is longer than the specified 1500
Created a chunk of size 2613, which is longer than the specified 1500
Created a chunk of size 1764, which is longer than the specified 1500
Created a chunk of size 1764, which is longer than the specified 1500
Created a chunk of size 1626, which is longer than the specified 1500
Created a chunk of size 1703, which is longer than the specified 1500
Created a chunk of size 1866, which is longer than the specified 1500
Created a chunk of size 1661, which is longer than the specified 1500
Created a chunk of size 1530, which is longer than the specified 1500
Created a chunk of size 1800, which is longer than the specified 1500


Loaded 2820 existing IDs from LanceDB.
Created 741 text chunks.


Processing chunks: 100%|██████████| 24/24 [13:07<00:00, 32.81s/it]



Finished processing.
Chunks inserted      : 741
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 741

 

Working documents: 48 to 51
Starting on ./final_docling_output/Game_Report_Global08.md document
Starting on ./final_docling_output/FY03_Wargaming_Assessment_Report.md document
Starting on ./final_docling_output/Future_Warfare_20XX_Wargame_Series_Report.md document
Starting on ./final_docling_output/Framework_for_MNE_5.md document



Created a chunk of size 1621, which is longer than the specified 1500
Created a chunk of size 1680, which is longer than the specified 1500
Created a chunk of size 1626, which is longer than the specified 1500
Created a chunk of size 1506, which is longer than the specified 1500


Loaded 3561 existing IDs from LanceDB.
Created 474 text chunks.


Processing chunks: 100%|██████████| 15/15 [07:05<00:00, 28.36s/it]



Finished processing.
Chunks inserted      : 474
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 474

 

Working documents: 52 to 55
Starting on ./final_docling_output/Global_Wargame_2000.md document
Starting on ./final_docling_output/Global_Shipping_Game_2010.md document
Starting on ./final_docling_output/Globally_Integrated_Logistics_2017.md document
time out error, pausing for 10 seconds
Starting on ./final_docling_output/GAMMA_MNE_4.md document



Created a chunk of size 1627, which is longer than the specified 1500


Loaded 4035 existing IDs from LanceDB.
Created 459 text chunks.


Processing chunks: 100%|██████████| 15/15 [06:55<00:00, 27.71s/it]



Finished processing.
Chunks inserted      : 459
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 459

 

Working documents: 56 to 59
Starting on ./final_docling_output/GridEx_I_Report.md document
Starting on ./final_docling_output/GridEx_IV_Report.md document
Starting on ./final_docling_output/GridEx_II_Report.md document
Starting on ./final_docling_output/GridEx_III_Report.md document



Created a chunk of size 1522, which is longer than the specified 1500
Created a chunk of size 1606, which is longer than the specified 1500


Loaded 4494 existing IDs from LanceDB.
Created 233 text chunks.


Processing chunks: 100%|██████████| 8/8 [03:20<00:00, 25.02s/it]



Finished processing.
Chunks inserted      : 233
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 233

 

Working documents: 60 to 63
Starting on ./final_docling_output/Information_Flow_1982.md document
Starting on ./final_docling_output/GridEx_V_Report.md document
Starting on ./final_docling_output/GridEx_VI_Report.md document
Starting on ./final_docling_output/GridEx_VII_Report.md document



Created a chunk of size 1574, which is longer than the specified 1500
Created a chunk of size 1725, which is longer than the specified 1500
Created a chunk of size 2580, which is longer than the specified 1500


Loaded 4727 existing IDs from LanceDB.
Created 260 text chunks.


Processing chunks: 100%|██████████| 9/9 [04:27<00:00, 29.69s/it]



Finished processing.
Chunks inserted      : 260
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 260

 

Working documents: 64 to 67
Starting on ./final_docling_output/Joint_Staff_OrgChart_4-27-23.md document
Starting on ./final_docling_output/Issues_Secretary_of_Navy_Wargame_94.md document
Starting on ./final_docling_output/Irregular_Challenges_2010.md document
Starting on ./final_docling_output/Inter-American_Game_Report.md document



Created a chunk of size 1556, which is longer than the specified 1500
Created a chunk of size 1589, which is longer than the specified 1500
Created a chunk of size 1828, which is longer than the specified 1500


Loaded 4987 existing IDs from LanceDB.
Created 291 text chunks.


Processing chunks: 100%|██████████| 10/10 [05:14<00:00, 31.42s/it]



Finished processing.
Chunks inserted      : 291
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 291

 

Working documents: 68 to 71
Starting on ./final_docling_output/Kuznetsov_13_Game_Report.md document
Starting on ./final_docling_output/Kuznetsov_11_Game_Report.md document
Starting on ./final_docling_output/KIBOWI_Netherlands_Army.md document
Starting on ./final_docling_output/Key_NATO_Exercises_2021.md document

Loaded 5278 existing IDs from LanceDB.
Created 97 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:28<00:00, 22.24s/it]



Finished processing.
Chunks inserted      : 97
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 97

 

Working documents: 72 to 75
Starting on ./final_docling_output/Manual_FoodSafety_Slovak_2017.md document
Starting on ./final_docling_output/Manhattan_2001.md document
Starting on ./final_docling_output/Mali_Analysis_Strategic_Wargaming_Series.md document
Starting on ./final_docling_output/LOGWAR_15_Analysis_Report.md document
time out error, pausing for 10 seconds



Created a chunk of size 2017, which is longer than the specified 1500


Loaded 5375 existing IDs from LanceDB.
Created 306 text chunks.


Processing chunks: 100%|██████████| 10/10 [05:30<00:00, 33.05s/it]



Finished processing.
Chunks inserted      : 306
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 306

 

Working documents: 76 to 79
Starting on ./final_docling_output/ME7_Deterrence.md document
Starting on ./final_docling_output/ME7_Access_to_Space.md document
Starting on ./final_docling_output/Maritime_Stability_Operations_2011.md document
Starting on ./final_docling_output/Maritime_Domain_Awareness_Operational_Game_2010.md document



Created a chunk of size 1539, which is longer than the specified 1500
Created a chunk of size 1770, which is longer than the specified 1500
Created a chunk of size 1778, which is longer than the specified 1500
Created a chunk of size 1636, which is longer than the specified 1500


Loaded 5681 existing IDs from LanceDB.
Created 390 text chunks.


Processing chunks: 100%|██████████| 13/13 [05:40<00:00, 26.19s/it]



Finished processing.
Chunks inserted      : 390
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 390

 

Working documents: 80 to 83
Starting on ./final_docling_output/ME7_Outcome_1_092012.md document
Starting on ./final_docling_output/ME7_Mediterranean.md document
Starting on ./final_docling_output/ME7_Maritime_Security_Arctic.md document
Starting on ./final_docling_output/ME7_Diagrams.md document

Loaded 6071 existing IDs from LanceDB.
Created 227 text chunks.


Processing chunks: 100%|██████████| 8/8 [03:17<00:00, 24.71s/it]



Finished processing.
Chunks inserted      : 227
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 227

 

Working documents: 84 to 87
Starting on ./final_docling_output/ME7_Space_Dependencies.md document
Starting on ./final_docling_output/ME7_Outcome_4__2013.md document
Starting on ./final_docling_output/ME7_Outcome_3_Cyber_2012.md document
Starting on ./final_docling_output/ME7_Outcome_3.md document



Created a chunk of size 1565, which is longer than the specified 1500
Created a chunk of size 1904, which is longer than the specified 1500
Created a chunk of size 2314, which is longer than the specified 1500
Created a chunk of size 1510, which is longer than the specified 1500
Created a chunk of size 1967, which is longer than the specified 1500
Created a chunk of size 1535, which is longer than the specified 1500


Loaded 6298 existing IDs from LanceDB.
Created 308 text chunks.


Processing chunks: 100%|██████████| 10/10 [04:34<00:00, 27.42s/it]



Finished processing.
Chunks inserted      : 308
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 308

 

Working documents: 88 to 91
Starting on ./final_docling_output/MNE_5_Findings.md document
Starting on ./final_docling_output/MNE_5_Cooperative.md document
Starting on ./final_docling_output/MNE4_Modeling_and_Simulation.md document
Starting on ./final_docling_output/Memo_31_081956.md document



Created a chunk of size 1632, which is longer than the specified 1500
Created a chunk of size 1903, which is longer than the specified 1500
Created a chunk of size 1620, which is longer than the specified 1500
Created a chunk of size 2148, which is longer than the specified 1500
Created a chunk of size 1698, which is longer than the specified 1500
Created a chunk of size 1686, which is longer than the specified 1500
Created a chunk of size 1693, which is longer than the specified 1500
Created a chunk of size 1616, which is longer than the specified 1500


Loaded 6606 existing IDs from LanceDB.
Created 124 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:47<00:00, 26.94s/it]



Finished processing.
Chunks inserted      : 124
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 124

 

Working documents: 92 to 95
Starting on ./final_docling_output/Naval_Mine_Anti-Submarine_2013.md document
Starting on ./final_docling_output/NATO_EXERCISES_1959.md document
Starting on ./final_docling_output/MNE_7_Product_Catalogue.md document
Starting on ./final_docling_output/MNE_5_Results_and_Products.md document

Loaded 6730 existing IDs from LanceDB.
Created 97 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:32<00:00, 23.14s/it]



Finished processing.
Chunks inserted      : 97
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 97

 

Working documents: 96 to 99
Starting on ./final_docling_output/Navy-Private_Sector_Critical_Infrastructure_2017.md document
Starting on ./final_docling_output/Naval_Services_Game_12.md document
Starting on ./final_docling_output/Naval_Services_Game13_Game_Report.md document
Starting on ./final_docling_output/Naval_Services_Game13_Analytic_Summary.md document



Created a chunk of size 1961, which is longer than the specified 1500
Created a chunk of size 1895, which is longer than the specified 1500
Created a chunk of size 2045, which is longer than the specified 1500
Created a chunk of size 1616, which is longer than the specified 1500


Loaded 6827 existing IDs from LanceDB.
Created 439 text chunks.


Processing chunks: 100%|██████████| 14/14 [06:11<00:00, 26.53s/it]



Finished processing.
Chunks inserted      : 439
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 439

 

Working documents: 100 to 103
Starting on ./final_docling_output/Olympiad_I-62.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Olympiad_I-62.md
error code: Invalid json output: Here is a JSON-formatted version of your requested output based on the provided schema:

 {
  "Title": "OLYMPIAD I-62",
  "Date Of Pub": "1962-12-01",
  "Agency": [
    "Assistant Secretary of Defense for International Security Affairs"
  ],
  "Domain": [
    "Land",
    "Maritime",
    "Air",
    "Space",
    "Cyberspace"
  ],
  "Country": [
    "United States",
    "United Kingdom"
  ],
  "Cocom": [
    "INDOPACOM"
  ],
  "Category": "Policy",
  "Purpose": "To explore and test the implications of different policy decisions and military strategies in a global context"
}

Note that I used examples provided in the schema to fi

Created a chunk of size 1556, which is longer than the specified 1500


Loaded 7266 existing IDs from LanceDB.
Created 124 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:47<00:00, 26.76s/it]



Finished processing.
Chunks inserted      : 124
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 124

 

Working documents: 104 to 107
Starting on ./final_docling_output/paper_370.md document
error at ./final_docling_output/paper_370.md
error code: Failed to parse SearchSchema from completion {"title": "Tactical Engagement Simulation Training: A Method for Learning the Realities of Combat", "date_of_pub": "1979-08-01", "agency": ["U.S. Army Research Institute for the Behavioral and Social Sciences", "U.S. Army Training Support Center"], "domain": ["Land"], "country": ["United States"], "cocom": ["TRADOC"], "category": "process", "purpose": "The purpose of this wargame report is to describe a method for collective training for combat arms units, known as engagement simulation, which provides an environment for the training of tactical skills for a complete unit. The report also discusses a method for empirically determining which tactical behaviors are related t

Created a chunk of size 1668, which is longer than the specified 1500
Created a chunk of size 1724, which is longer than the specified 1500


Loaded 7390 existing IDs from LanceDB.
Created 65 text chunks.


Processing chunks: 100%|██████████| 3/3 [01:05<00:00, 21.81s/it]



Finished processing.
Chunks inserted      : 65
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 65

 

Working documents: 108 to 111
Starting on ./final_docling_output/Post-2014_Afghanistan_Wargame_Analysis.md document
Starting on ./final_docling_output/PORTABLE_CASTLE_Quick_Look_Final_20180601.md document
Starting on ./final_docling_output/POLITICA_and_AGILE-COIN_1966.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/POLITICA_and_AGILE-COIN_1966.md
error code: Failed to parse SearchSchema from completion {"title": "DEMONSTRATION OF TWO SIMULATIONS OF INTERNAL REVOLUTIONARY CONFLICT: POLITICA AND AGILE-COIN", "date_of_pub": "1967-01-01", "agency": ["Abt Associates Inc."], "domain": ["Land", "Cyberspace"], "country": ["Inertia"], "cocom": ["Other"], "category": "Policy", "purpose": "To simulate and understand the dynamics of internal conflict, specifically revolutionary conflict, and how different faction

Created a chunk of size 2124, which is longer than the specified 1500
Created a chunk of size 1595, which is longer than the specified 1500


Loaded 7455 existing IDs from LanceDB.
Created 133 text chunks.


Processing chunks: 100%|██████████| 5/5 [01:54<00:00, 22.83s/it]



Finished processing.
Chunks inserted      : 133
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 133

 

Working documents: 112 to 115
Starting on ./final_docling_output/PROSAIC TWAIN_Quick Look_V4.md document
Starting on ./final_docling_output/Proliferation_Security_Initiative_14.md document
Starting on ./final_docling_output/Project_Centaur__The_Wargame_Rules.md document
Starting on ./final_docling_output/PRESCIENT VISION Quick Look FINAL.md document

Loaded 7588 existing IDs from LanceDB.
Created 286 text chunks.


Processing chunks: 100%|██████████| 9/9 [05:19<00:00, 35.49s/it]



Finished processing.
Chunks inserted      : 286
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 286

 

Working documents: 116 to 119
Starting on ./final_docling_output/report_1164.md document
Starting on ./final_docling_output/Red_Gaming_in_Support_of_the_War_on_Terrorism_Sandia_Red_Game_Report.md document
Starting on ./final_docling_output/QUICK.md document
Starting on ./final_docling_output/QDR_Defense_Strategy.md document



Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 1768, which is longer than the specified 1500
Created a chunk of s

Loaded 7874 existing IDs from LanceDB.
Created 407 text chunks.


Processing chunks:  38%|███▊      | 5/13 [04:35<09:19, 69.92s/it]

Metadata extraction failed for ./final_docling_output/QUICK.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: a6d8ea63-9866-4470-bce3-7b104a2852dc)')


Processing chunks: 100%|██████████| 13/13 [09:30<00:00, 43.91s/it]



Finished processing.
Chunks inserted      : 406
Chunks skipped       : 0
Chunks failed (LLM)  : 1
Total chunked_docs   : 407
Failed chunks:
 - ./final_docling_output/QUICK.md

 

Working documents: 120 to 123
Starting on ./final_docling_output/Report_of_International_Game_96.md document
Starting on ./final_docling_output/Report_NATO_Exercises_1956.md document
Starting on ./final_docling_output/Report_IV_Crisis_Sim.md document
Starting on ./final_docling_output/Report_Crisis_Simulation_2009__EFSA.md document



Created a chunk of size 2145, which is longer than the specified 1500
Created a chunk of size 1519, which is longer than the specified 1500


Loaded 8280 existing IDs from LanceDB.
Created 346 text chunks.


Processing chunks: 100%|██████████| 11/11 [05:32<00:00, 30.26s/it]



Finished processing.
Chunks inserted      : 346
Chunks skipped       : 0
Chunks failed (LLM)  : 1
Total chunked_docs   : 346
Failed chunks:
 - ./final_docling_output/QUICK.md

 

Working documents: 124 to 127
Starting on ./final_docling_output/Setear_061990.md document
Starting on ./final_docling_output/SCYLLA_III-73_Quick_Look.md document
too many tokens, attempting to shorten document with 572189 characters
detected bad ocr, cleaning and retrying API call
Starting on ./final_docling_output/Revolution_in_Military_Affairs_2020_Vol_Il.md document
Starting on ./final_docling_output/Report_On_NATO_Exercises.md document



Created a chunk of size 1888, which is longer than the specified 1500
Created a chunk of size 1800, which is longer than the specified 1500
Created a chunk of size 1847, which is longer than the specified 1500
Created a chunk of size 1825, which is longer than the specified 1500
Created a chunk of size 1664, which is longer than the specified 1500
Created a chunk of size 1931, which is longer than the specified 1500
Created a chunk of size 1813, which is longer than the specified 1500
Created a chunk of size 2292, which is longer than the specified 1500
Created a chunk of size 1731, which is longer than the specified 1500
Created a chunk of size 1512, which is longer than the specified 1500
Created a chunk of size 1850, which is longer than the specified 1500
Created a chunk of size 3347, which is longer than the specified 1500
Created a chunk of size 1600, which is longer than the specified 1500


Loaded 8626 existing IDs from LanceDB.
Created 736 text chunks.


Processing chunks:  74%|███████▍  | 17/23 [13:20<07:45, 77.66s/it]

Metadata extraction failed for ./final_docling_output/SCYLLA_III-73_Quick_Look.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 18363a89-b5e8-42cf-b084-ac053736e852)')


Processing chunks: 100%|██████████| 23/23 [18:42<00:00, 48.79s/it]



Finished processing.
Chunks inserted      : 735
Chunks skipped       : 0
Chunks failed (LLM)  : 2
Total chunked_docs   : 736
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md

 

Working documents: 128 to 131
Starting on ./final_docling_output/Six_Demos_AGILE-COIN.md document
time out error, pausing for 10 seconds
Starting on ./final_docling_output/SIGNAL_Game_Manual.md document
Starting on ./final_docling_output/Shipbuilding_Game.md document
Starting on ./final_docling_output/Shaken_10-01_Table_Top_Exercise_After_Action_Report.md document



Created a chunk of size 1513, which is longer than the specified 1500
Created a chunk of size 1752, which is longer than the specified 1500


Loaded 9361 existing IDs from LanceDB.
Created 300 text chunks.


Processing chunks: 100%|██████████| 10/10 [04:38<00:00, 27.84s/it]



Finished processing.
Chunks inserted      : 300
Chunks skipped       : 0
Chunks failed (LLM)  : 2
Total chunked_docs   : 300
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md

 

Working documents: 132 to 135
Starting on ./final_docling_output/Syria_Analysis_Game.md document
Starting on ./final_docling_output/Stress_Testing_Financial_Crisis_2007.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Stress_Testing_Financial_Crisis_2007.md
error code: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: a182a7d2-57f3-4b5e-af15-5d53b097ae71)')
Starting on ./final_docling_output/STDE24_Fact_Sheet.md document
Starting on ./final_docling_output/SMITE.md document



Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500
Created a chunk of size 1769, which is longer than the specified 1500


Loaded 9661 existing IDs from LanceDB.
Created 211 text chunks.


Processing chunks: 100%|██████████| 7/7 [03:14<00:00, 27.78s/it]



Finished processing.
Chunks inserted      : 211
Chunks skipped       : 0
Chunks failed (LLM)  : 2
Total chunked_docs   : 211
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md

 

Working documents: 136 to 139
Starting on ./final_docling_output/URB-COIN_Game.md document
Starting on ./final_docling_output/TRIDENT_JUNCTURE_2015.md document
Starting on ./final_docling_output/Theater_Battle_Model_Volume_VII.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Theater_Battle_Model_Volume_VII.md
error code: Invalid json output: Here is the output that follows the specified schema:


{
  "title": "Theater Battle Model",
  "date_of_pub": "1968-01-01",
  "agency": [
    "Joint War Games Agency"
  ],
  "domain": {
    "items": [
      "Land",
      "Maritime",
      "Air",
      "Space",
      "Cyberspace"
    ]
  },
  "country": [
    "United States"
  ],
  "cocom": {
    "items": [
    

Created a chunk of size 1511, which is longer than the specified 1500
Created a chunk of size 1756, which is longer than the specified 1500


Loaded 9872 existing IDs from LanceDB.
Created 315 text chunks.


Processing chunks: 100%|██████████| 10/10 [05:19<00:00, 31.97s/it]



Finished processing.
Chunks inserted      : 315
Chunks skipped       : 0
Chunks failed (LLM)  : 2
Total chunked_docs   : 315
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md

 

Working documents: 140 to 143
Starting on ./final_docling_output/WAGCAP_Phase_II.md document
error at ./final_docling_output/WAGCAP_Phase_II.md
error code: Failed to parse SearchSchema from completion {"title": "IMPROVEMENT OF THE WAR GAMING CAPABILITY, PHASE II (WAGCAP II)", "date_of_pub": "1973-06-01", "agency": ["US Army Combat Developments Command", "Computer Sciences Corporation"], "domain": ["Land"], "country": ["United States"], "cocom": ["USACDC"], "category": "capability", "purpose": "The purpose of this wargame was to improve the war gaming capability of the US Army by developing and testing the Division War Game (DIVWAG) model."}. Got: 1 validation error for SearchSchema
cocom.0
  Input should be 'INDOPACOM', 'PACOM', 'CENTCOM', 'EUCOM', 'AFRICO

Created a chunk of size 1528, which is longer than the specified 1500
Created a chunk of size 1528, which is longer than the specified 1500
Created a chunk of size 1528, which is longer than the specified 1500
Created a chunk of size 1523, which is longer than the specified 1500
Created a chunk of size 1523, which is longer than the specified 1500
Created a chunk of size 1523, which is longer than the specified 1500
Created a chunk of size 1523, which is longer than the specified 1500
Created a chunk of size 1518, which is longer than the specified 1500
Created a chunk of size 1518, which is longer than the specified 1500
Created a chunk of size 1518, which is longer than the specified 1500
Created a chunk of size 1518, which is longer than the specified 1500


Loaded 10187 existing IDs from LanceDB.
Created 309 text chunks.


Processing chunks: 100%|██████████| 10/10 [04:48<00:00, 28.89s/it]



Finished processing.
Chunks inserted      : 309
Chunks skipped       : 0
Chunks failed (LLM)  : 2
Total chunked_docs   : 309
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md

 

Working documents: 144 to 147
Starting on ./final_docling_output/WAGCAP_Phase_II_AppD.md document
Starting on ./final_docling_output/WAGCAP_Phase_II_AppC.md document
too many tokens, attempting to shorten document with 992921 characters
detected bad ocr, cleaning and retrying API call
error at ./final_docling_output/WAGCAP_Phase_II_AppC.md
error code: Failed to parse SearchSchema from completion {"title": "IMPROVEMENT OF THE WAR-GAMING CAPABILITY, PHASE II", "date_of_pub": "1973-06-01", "agency": ["Computer Sciences Corporation", "US Army"], "domain": ["Land"], "country": ["United States"], "cocom": ["USACDC"], "category": "capability", "purpose": "The purpose of this wargame is to improve the war-gaming capability of the US Army by modifying the Division 

Created a chunk of size 1625, which is longer than the specified 1500
Created a chunk of size 2276, which is longer than the specified 1500
Created a chunk of size 1637, which is longer than the specified 1500
Created a chunk of size 1531, which is longer than the specified 1500
Created a chunk of size 1956, which is longer than the specified 1500
Created a chunk of size 1710, which is longer than the specified 1500
Created a chunk of size 1642, which is longer than the specified 1500
Created a chunk of size 1617, which is longer than the specified 1500
Created a chunk of size 1732, which is longer than the specified 1500
Created a chunk of size 1790, which is longer than the specified 1500
Created a chunk of size 1520, which is longer than the specified 1500
Created a chunk of size 1689, which is longer than the specified 1500
Created a chunk of size 1842, which is longer than the specified 1500
Created a chunk of size 1518, which is longer than the specified 1500
Created a chunk of s

Loaded 10496 existing IDs from LanceDB.
Created 1641 text chunks.


Processing chunks:  62%|██████▏   | 32/52 [18:25<22:26, 67.31s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 0e0cd374-f112-495b-9e32-9d85c246c168)')


Processing chunks:  90%|█████████ | 47/52 [28:36<05:07, 61.49s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 9c24007b-9c5e-47c8-a3e3-0c5135a0a513)')


Processing chunks: 100%|██████████| 52/52 [31:36<00:00, 36.46s/it]



Finished processing.
Chunks inserted      : 1639
Chunks skipped       : 0
Chunks failed (LLM)  : 4
Total chunked_docs   : 1641
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md

 

Working documents: 148 to 151
Starting on ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md document
too many tokens, attempting to shorten document with 816095 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/WAGCAP_Volume_IV.md document
too many tokens, attempting to shorten document with 601216 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/WAGCAP_Volume_III_DIVWAG_Tech_Manual.md document
too many tokens, attempting to shorten document with 892546 characters
detected bad ocr, cleaning and retrying API call
Starting on ./final_docling_output/W

Created a chunk of size 2104, which is longer than the specified 1500
Created a chunk of size 1717, which is longer than the specified 1500
Created a chunk of size 1534, which is longer than the specified 1500
Created a chunk of size 1853, which is longer than the specified 1500
Created a chunk of size 1826, which is longer than the specified 1500
Created a chunk of size 1613, which is longer than the specified 1500
Created a chunk of size 1988, which is longer than the specified 1500
Created a chunk of size 2130, which is longer than the specified 1500
Created a chunk of size 1977, which is longer than the specified 1500
Created a chunk of size 1585, which is longer than the specified 1500
Created a chunk of size 1559, which is longer than the specified 1500
Created a chunk of size 1504, which is longer than the specified 1500
Created a chunk of size 1545, which is longer than the specified 1500
Created a chunk of size 1512, which is longer than the specified 1500
Created a chunk of s

Loaded 12135 existing IDs from LanceDB.
Created 1850 text chunks.


Processing chunks:  91%|█████████▏| 53/58 [32:11<05:51, 70.20s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 3dbf65da-e582-435d-a5b7-93548c30680d)')


Processing chunks: 100%|██████████| 58/58 [35:00<00:00, 36.21s/it]



Finished processing.
Chunks inserted      : 1849
Chunks skipped       : 0
Chunks failed (LLM)  : 5
Total chunked_docs   : 1850
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md

 

Working documents: 152 to 155
Starting on ./final_docling_output/WAGCAP_Volume_V_Part_II_DIVWAG_Prog_Manual.md document
too many tokens, attempting to shorten document with 805097 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md document
too many tokens, attempting to shorten document with 820016 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md document
too many tokens, attempting to shorten document with 1486762 

Created a chunk of size 1763, which is longer than the specified 1500
Created a chunk of size 4797, which is longer than the specified 1500
Created a chunk of size 1637, which is longer than the specified 1500
Created a chunk of size 2262, which is longer than the specified 1500
Created a chunk of size 1563, which is longer than the specified 1500
Created a chunk of size 1525, which is longer than the specified 1500
Created a chunk of size 2079, which is longer than the specified 1500
Created a chunk of size 1848, which is longer than the specified 1500
Created a chunk of size 1572, which is longer than the specified 1500
Created a chunk of size 2190, which is longer than the specified 1500
Created a chunk of size 1600, which is longer than the specified 1500
Created a chunk of size 2065, which is longer than the specified 1500
Created a chunk of size 1560, which is longer than the specified 1500
Created a chunk of size 1856, which is longer than the specified 1500
Created a chunk of s

Loaded 13984 existing IDs from LanceDB.
Created 2446 text chunks.


Processing chunks:  42%|████▏     | 32/77 [19:16<29:57, 39.94s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 45a4b7d2-273c-4c0e-b2c8-34617d888a53)')


Processing chunks:  43%|████▎     | 33/77 [21:23<48:29, 66.12s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: d6e80791-a22b-4922-a479-e076d5317223)')


Processing chunks:  48%|████▊     | 37/77 [25:38<49:29, 74.24s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: cb996fbf-07fe-485c-86e5-e90dca1944e9)')


Processing chunks:  53%|█████▎    | 41/77 [30:09<49:44, 82.90s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: e75c0ae4-f2e8-44ce-9bc7-efab094c18a3)')


Processing chunks:  73%|███████▎  | 56/77 [41:35<25:33, 73.03s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 7e112fc2-e4c6-4972-9281-30008477990f)')


Processing chunks:  90%|████████▉ | 69/77 [52:39<08:49, 66.14s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_II_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 09d94d7f-cf92-414f-9b3e-6a915774f420)')


Processing chunks: 100%|██████████| 77/77 [57:06<00:00, 44.50s/it]



Finished processing.
Chunks inserted      : 2440
Chunks skipped       : 0
Chunks failed (LLM)  : 11
Total chunked_docs   : 2446
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md
 ...and 6 more.

 

Working documents: 156 to 159
Starting on ./final_docling_output/WG_6-13_CTOC_Report.md document
Starting on ./final_docling_output/Warriors_Edge.md document
error at ./final_docling_output/Warriors_Edge.md
error code: Failed to parse SearchSchema from completion {"title": "Warriors Edge Simulation and Gaming System: The Squad Simulation", "date_of_pub": "2005-08-01", "agency": ["U.S. Army Research Laboratory"], "domain": ["Land"], "country": ["United States"], "cocom": ["None"], "category": "capability", "purpose": "The purpose of the wargame is to develop a simulation 

Created a chunk of size 1712, which is longer than the specified 1500
Created a chunk of size 1583, which is longer than the specified 1500
Created a chunk of size 2151, which is longer than the specified 1500
Created a chunk of size 1568, which is longer than the specified 1500
Created a chunk of size 1916, which is longer than the specified 1500
Created a chunk of size 1916, which is longer than the specified 1500
Created a chunk of size 1916, which is longer than the specified 1500
Created a chunk of size 1916, which is longer than the specified 1500
Created a chunk of size 1792, which is longer than the specified 1500


Loaded 16424 existing IDs from LanceDB.
Created 1180 text chunks.


Processing chunks:  11%|█         | 4/37 [02:04<17:33, 31.93s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: bcb7359b-b464-46d4-942c-a08a57572858)')


Processing chunks:  14%|█▎        | 5/37 [04:31<39:03, 73.24s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 89b5be9e-c2a4-4a4c-8163-ced8d0a85dae)')


Processing chunks:  68%|██████▊   | 25/37 [18:51<14:07, 70.60s/it]

Metadata extraction failed for ./final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: bf489315-20dd-4bab-90fd-3b6d77cd730a)')


Processing chunks: 100%|██████████| 37/37 [23:51<00:00, 38.69s/it]



Finished processing.
Chunks inserted      : 1177
Chunks skipped       : 0
Chunks failed (LLM)  : 14
Total chunked_docs   : 1180
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md
 ...and 9 more.

 

Working documents: 160 to 161
Starting on ./final_docling_output/wmd_Wargaming_1994.md document
too many tokens, attempting to shorten document with 894102 characters
detected bad ocr, cleaning and retrying API call
Starting on ./final_docling_output/Wing_and_a_Prayer.md document
error at ./final_docling_output/Wing_and_a_Prayer.md
error code: Failed to parse SearchSchema from completion {"title": "A Wing and a Prayer", "date_of_pub": "2009-04-01", "agency": ["USDA-APHIS", "CNA"], "domain": ["Land"], "country": ["United States"], "cocom": ["USDA-APHIS"], "category": "pro

Created a chunk of size 1632, which is longer than the specified 1500
Created a chunk of size 1566, which is longer than the specified 1500
Created a chunk of size 1777, which is longer than the specified 1500
Created a chunk of size 1649, which is longer than the specified 1500
Created a chunk of size 1681, which is longer than the specified 1500
Created a chunk of size 1569, which is longer than the specified 1500
Created a chunk of size 1712, which is longer than the specified 1500
Created a chunk of size 1523, which is longer than the specified 1500
Created a chunk of size 1542, which is longer than the specified 1500
Created a chunk of size 1555, which is longer than the specified 1500
Created a chunk of size 1845, which is longer than the specified 1500
Created a chunk of size 1809, which is longer than the specified 1500
Created a chunk of size 2061, which is longer than the specified 1500
Created a chunk of size 1965, which is longer than the specified 1500
Created a chunk of s

Loaded 17601 existing IDs from LanceDB.
Created 772 text chunks.


Processing chunks: 100%|██████████| 25/25 [11:50<00:00, 28.40s/it]



Finished processing.
Chunks inserted      : 772
Chunks skipped       : 0
Chunks failed (LLM)  : 14
Total chunked_docs   : 772
Failed chunks:
 - ./final_docling_output/QUICK.md
 - ./final_docling_output/SCYLLA_III-73_Quick_Look.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
 - ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md
 ...and 9 more.

 



In [29]:
failed_chunks

['./final_docling_output/QUICK.md',
 './final_docling_output/SCYLLA_III-73_Quick_Look.md',
 './final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md',
 './final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md',
 './final_docling_output/WAGCAP_Volume_VII_DIVWAG.md',
 './final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md',
 './final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md',
 './final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md',
 './final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_V_Part_II_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md']

In [None]:
failed_paths = [
    "./final_docling_output/CAMMS_v_CPX.md",
    "./final_docling_output/Counter-Insurgency_Study.md",
    "./final_docling_output/Commercial_Satellite_Report.md",
    "./final_docling_output/ExerciseViking22_Final_Report.md",
    "./final_docling_output/Olympiad_I-62.md",
    "./final_docling_output/ODNA_Transformation_Strategy_GameIV.md",
    "./final_docling_output/paper_370.md",
    "./final_docling_output/POLITICA_and_AGILE-COIN_1966.md",
    "./final_docling_output/QUICK.md",
    "./final_docling_output/WAGCAP_Volume_VII_Testing_Report.md",
    "./final_docling_output/WAGCAP_Volume_VI_DIVWAG_Data_Req.md",
    "./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md",
    "./final_docling_output/WAGCAP_Volume_V_Part_II_DIVWAG_Prog_Manual.md",
    "./final_docling_output/SCYLLA_III-73_Quick_Look.md",
    "./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md",
    "./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md",
    "./final_docling_output/Warriors_Edge.md",
    "./final_docling_output/WAGCAP_Volume_V_Part_I_DIVWAG_Prog_Manual.md",
    "./final_docling_output/Wing_and_a_Prayer.md",
    "./final_docling_output/WAGCAP_Phase_II_AppC.md",
    "./final_docling_output/WAGCAP_Volume_I.md",
    "./final_docling_output/Stress_Testing_Financial_Crisis_2007.md",
    "./final_docling_output/Theater_Battle_Model_Volume_VII.md",
    "./final_docling_output/WAGCAP_Phase_II.md"
]
# get list of paths to md files in specified directory
directory = "./final_docling_output"
doc_paths = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith('.md')]

# Compare lists
missing_from_master = [path for path in failed_paths if path not in doc_paths]

# Output results
if missing_from_master:
    print("The following failed paths are NOT in the master list:")
    for path in missing_from_master:
        print(f" - {path}")
else:
    print("All failed paths are present in the master list.")





All failed paths are present in the master list.


In [31]:
docs = [TextLoader(doc_path).load() for doc_path in failed_paths]
docs_flat_partial = [item for sublist in docs for item in sublist]
len(docs_flat_partial)

24

In [None]:
#### Run again against failed paths

docs_flat=docs_flat_partial #allows you to test the code without running the whole dang thing
doc_batch_size=4
full_doc_metadata={}
error_log = {} 
failed_chunks = []
# Suppress deprecation warnings
warnings.filterwarnings("ignore", category=PydanticDeprecatedSince20)
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
for j in range(0, len(docs_flat), doc_batch_size):
    remove_doc=[]
    cutoff=j+doc_batch_size-1
    if cutoff>len(docs_flat):
        cutoff=len(docs_flat)-1
    print(f'Working documents: {j} to {cutoff}')
    current_docs=docs_flat[j:j+doc_batch_size]
    full_doc_metadata={}
    for doc in reversed(current_docs): 
        print(f'Starting on {doc.metadata['source']} document')
        try:
            json_output=chain.invoke({"wargame_report":doc.page_content})  
            full_doc_metadata[doc.metadata['source']]=json_output.model_dump()
        except Exception as e:
            if "422" in str(e):
                print(f"too many tokens, attempting to shorten document with {len(doc.page_content)} characters")
                time.sleep(5)
                try:
                    if is_bad_ocr(doc.page_content):
                        print("detected bad ocr, cleaning and retrying API call")
                        text=clean_ocr_text(doc.page_content)
                        text=doc.page_content[:150000] +"..."                        
                        json_output=chain.invoke({"wargame_report":text})  
                        full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                    else:
                        print("good ocr, document is just too long, trimming end") 
                        text=clean_ocr_text(doc.page_content)
                        text=doc.page_content[:150000] +"..."
                        json_output=chain.invoke({"wargame_report":text})  
                        full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                except Exception as f:
                    print(f'error at {doc.metadata['source']}')
                    print(f'error code: {f}')
                    error_log[doc.metadata['source']]= str(f)
                    current_docs.remove(doc)
            elif "Read timed out." in str(e):
                print("time out error, pausing for 10 seconds")
                time.sleep(10)
                try:
                    json_output=chain.invoke({"wargame_report":doc.page_content})  
                    full_doc_metadata[doc.metadata['source']]=json_output.model_dump() 
                except Exception as f:
                    print("time out error not fixed")
                    print(f'error at {doc.metadata['source']}')
                    print(f'error code: {f}')
                    error_log[doc.metadata['source']]= str(f)
                    current_docs.remove(doc)          
           # if str(e)
            else:
                print(f'error at {doc.metadata['source']}')
                print(f'error code: {e}')
                error_log[doc.metadata['source']]= str(e)
                current_docs.remove(doc)

    # Create IDs for each chunk so we're not inserting duplicates
    existing_ids = set()
    print()
    try:
        existing_ids = set(row["id"] for row in table.to_arrow().to_pylist())
        print(f"Loaded {len(existing_ids)} existing IDs from LanceDB.")
    except Exception as e:
        print(f"Warning: Could not load existing IDs. Starting fresh. Reason: {e}")

    # Initialize counters
    skipped = 0
    processed = 0


    # Batch size for LLM + TEI
    batch_size = 32
    updates = []
    texts_to_embed = []
    rows_to_embed = []

    #remove docs where API call failed

    # Apply splitter to your loaded docs
    chunked_docs = []
    for doc in current_docs:
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            chunked_docs.append({
                "text": chunk,
                "file_path": doc.metadata.get("source", ""),
                "chunk_index": i
            })

    print(f"Created {len(chunked_docs)} text chunks.")
    # Processing loop
    for i in tqdm(range(0, len(chunked_docs), batch_size), desc="Processing chunks"):
        batch = chunked_docs[i:i + batch_size]

        # Filter out already-inserted chunks
        filtered_batch = []
        for chunk in batch:
            row_id = f"{chunk['file_path']}#{chunk['chunk_index']}"
            if row_id not in existing_ids:
                filtered_batch.append((row_id, chunk))
            else:
                skipped += 1

        if not filtered_batch:
            continue

        # Parallelize metadata extraction
        # Create worker threads
        with ThreadPoolExecutor(max_workers=5) as executor:
            # Submit tasks to the executor
            futures = {
                executor.submit(extract_metadata, chunk["text"], chunk["file_path"]): (row_id, chunk)
                for row_id, chunk in filtered_batch
            }
            # Process results as they complete
            for future in as_completed(futures):
                try:
                    metadata = future.result()
                    row_id, chunk = futures[future]
                    # Add the remaining schema-required fields
                    # ** unpack the metadata dict
                    row_metadata = ChunkMetadata(
                        **{
                            **metadata,
                            **full_doc_metadata[chunk["file_path"]]
                        }
                    ).model_dump()
                    # Package into full LanceDB-compatible row
                    rows_to_embed.append({
                        "id": row_id,
                        "text": chunk["text"],
                        "metadata": row_metadata
                    })
                    texts_to_embed.append(chunk["text"])
                    processed += 1

                except Exception as e:
                    file_path = futures[future][1]['file_path']
                    print(f"Metadata extraction failed for {file_path}: {e}")
                    failed_chunks.append(file_path)

        # Run TEI Embedding
        if texts_to_embed:
            # Embed the texts in batches
            vectors = safe_embed_documents(rows_to_embed)

            # Add vectors to the rows
            # zip the vectors with the rows of text
            for row, vector in zip(rows_to_embed, vectors):
                updates.append({
                    "id": row["id"],
                    "vector": vector,
                    "text": row["text"],
                    "metadata": row["metadata"]
                })

            existing_ids.update(row["id"] for row in rows_to_embed)
            rows_to_embed = []
            texts_to_embed = []

        # Add LanceDB updates in batches
        # Check if the updates list has reached the batch size
        if len(updates) >= 256:
            table.add(updates)
            updates = []

    # Only for the last batch which may not be full
    if texts_to_embed:
        vectors = safe_embed_documents(rows_to_embed)

        for row, vector in zip(rows_to_embed, vectors):
            updates.append({
                "id": row["id"],
                "vector": vector,
                "text": row["text"],
                "metadata": row["metadata"]
            })
        existing_ids.update(row["id"] for row in rows_to_embed)
    # final catch for updates
    if updates:
        table.add(updates)

    # Final Report
    print("\nFinished processing.")
    print(f"Chunks inserted      : {processed}")
    print(f"Chunks skipped       : {skipped}")
    print(f"Chunks failed (LLM)  : {len(failed_chunks)}")
    print(f"Total chunked_docs   : {len(chunked_docs)}")

    if failed_chunks:
        print("Failed chunks:")
        for path in failed_chunks[:5]:
            print(f" - {path}")
        if len(failed_chunks) > 5:
            print(f" ...and {len(failed_chunks) - 5} more.")

    print("\n \n") 

Working documents: 0 to 3
Starting on ./final_docling_output/ExerciseViking22_Final_Report.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/ExerciseViking22_Final_Report.md
error code: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 90214190-c011-43fb-b271-b05c88cb83e8)')
Starting on ./final_docling_output/Commercial_Satellite_Report.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Commercial_Satellite_Report.md
error code: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: 5030e6a6-5ed9-46f2-a1e4-79df980b50c4)')
Starting on ./final_docling_output/Counter-Insurgency_Study.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Counter-Insurgency_Study

Processing chunks: 0it [00:00, ?it/s]


Finished processing.
Chunks inserted      : 0
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 0

 

Working documents: 4 to 7
Starting on ./final_docling_output/POLITICA_and_AGILE-COIN_1966.md document





time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/POLITICA_and_AGILE-COIN_1966.md
error code: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: a8a29e8b-04d5-4ca8-b67c-1bbeed51bedf)')
Starting on ./final_docling_output/paper_370.md document
Starting on ./final_docling_output/ODNA_Transformation_Strategy_GameIV.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/ODNA_Transformation_Strategy_GameIV.md
error code: (ReadTimeoutError("HTTPConnectionPool(host='trac-malenia.ern.nps.edu', port=8080): Read timed out. (read timeout=120)"), '(Request ID: fd25ef52-2231-4634-ac52-9b283ac8ae0e)')
Starting on ./final_docling_output/Olympiad_I-62.md document
time out error, pausing for 10 seconds
time out error not fixed
error at ./final_docling_output/Olympiad_I-62.md
error code: (ReadTimeoutError("HTTPConnectionPool

Processing chunks: 100%|██████████| 2/2 [00:47<00:00, 23.61s/it]



Finished processing.
Chunks inserted      : 51
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 51

 

Working documents: 8 to 11
Starting on ./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md document
too many tokens, attempting to shorten document with 820016 characters
good ocr, document is just too long, trimming end
error at ./final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md
error code: Failed to parse SearchSchema from completion {"title": "Improvement of the War Gaming Capability (WAGCAP)", "date_of_pub": "1972-08-01", "agency": ["US Army Combat Developments Command", "Computer Sciences Corporation"], "domain": ["Land"], "country": ["United States"], "cocom": ["OTHER"], "category": "process", "purpose": "The purpose of this wargame is to improve the war-gaming capability of the US Army by developing a new wargame model, known as the Division Wargame (DIVWAG), which can be used to simulate various military scenarios a

Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 2155, which is longer than the specified 1500
Created a chunk of size 1763, which is longer than the specified 1500
Created a chunk of s

Loaded 18424 existing IDs from LanceDB.
Created 1374 text chunks.


Processing chunks: 100%|██████████| 43/43 [00:10<00:00,  3.96it/s]



Finished processing.
Chunks inserted      : 4
Chunks skipped       : 1370
Chunks failed (LLM)  : 0
Total chunked_docs   : 1374

 

Working documents: 12 to 15
Starting on ./final_docling_output/WAGCAP_Volume_VII_DIVWAG.md document
too many tokens, attempting to shorten document with 816095 characters
good ocr, document is just too long, trimming end
Starting on ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md document
too many tokens, attempting to shorten document with 894488 characters
good ocr, document is just too long, trimming end
error at ./final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md
error code: Failed to parse SearchSchema from completion {"title": "AD-768 163: IMPROVEMENT OF THE WAR-GAMING CAPABILITY, PHASE II", "date_of_pub": "1973-06-01", "agency": ["Combat Developments Command", "Computer Sciences Corporation"], "domain": ["Land"], "country": ["United States"], "cocom": ["USACACD"], "category": "capability", "purpose": "The purpose of this wargame is to impr

Created a chunk of size 1855, which is longer than the specified 1500
Created a chunk of size 1748, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of size 1501, which is longer than the specified 1500
Created a chunk of s

Loaded 18428 existing IDs from LanceDB.
Created 1748 text chunks.


Processing chunks: 100%|██████████| 55/55 [00:10<00:00,  5.23it/s]



Finished processing.
Chunks inserted      : 3
Chunks skipped       : 1745
Chunks failed (LLM)  : 0
Total chunked_docs   : 1748

 

Working documents: 16 to 19
Starting on ./final_docling_output/WAGCAP_Phase_II_AppC.md document
too many tokens, attempting to shorten document with 992921 characters
detected bad ocr, cleaning and retrying API call
error at ./final_docling_output/WAGCAP_Phase_II_AppC.md
error code: Failed to parse SearchSchema from completion {"title": "IMPROVEMENT OF THE WAR-GAMING CAPABILITY, PHASE II", "date_of_pub": "1973-06-01", "agency": ["US Army", "Computer Sciences Corporation"], "domain": ["Land"], "country": ["United States"], "cocom": ["USACDC"], "category": "capability", "purpose": "The purpose of this wargame is to improve the war-gaming capability of the US Army by modifying and maintaining the Division War Game (DIVWAG) model."}. Got: 1 validation error for SearchSchema
cocom.0
  Input should be 'INDOPACOM', 'PACOM', 'CENTCOM', 'EUCOM', 'AFRICOM', 'NORTHCO

Created a chunk of size 1712, which is longer than the specified 1500
Created a chunk of size 1583, which is longer than the specified 1500
Created a chunk of size 2151, which is longer than the specified 1500


Loaded 18431 existing IDs from LanceDB.
Created 918 text chunks.


Processing chunks: 100%|██████████| 29/29 [01:55<00:00,  3.98s/it]



Finished processing.
Chunks inserted      : 129
Chunks skipped       : 789
Chunks failed (LLM)  : 0
Total chunked_docs   : 918

 

Working documents: 20 to 23
Starting on ./final_docling_output/WAGCAP_Phase_II.md document
error at ./final_docling_output/WAGCAP_Phase_II.md
error code: Failed to parse SearchSchema from completion {"title": "Improvement of the War Gaming Capability, Phase II (WAGCAP II)", "date_of_pub": "1973-06-01", "agency": ["US Army Combat Developments Command"], "domain": ["Land"], "country": ["United States"], "cocom": ["USACDC"], "category": "capability", "purpose": "The purpose of this report is to describe the work performed and results achieved by Computer Sciences Corporation's Combat Developments Research Office in response to USACDC Work Directive 6-72, Improvement of the War Gaming Capability, Phase II."}. Got: 1 validation error for SearchSchema
cocom.0
  Input should be 'INDOPACOM', 'PACOM', 'CENTCOM', 'EUCOM', 'AFRICOM', 'NORTHCOM', 'SOUTHCOM', 'SPACECOM

Created a chunk of size 1582, which is longer than the specified 1500
Created a chunk of size 1560, which is longer than the specified 1500


Loaded 18560 existing IDs from LanceDB.
Created 101 text chunks.


Processing chunks: 100%|██████████| 4/4 [01:45<00:00, 26.48s/it]


Finished processing.
Chunks inserted      : 101
Chunks skipped       : 0
Chunks failed (LLM)  : 0
Total chunked_docs   : 101

 






In [38]:
list(error_log.keys())

['./final_docling_output/ExerciseViking22_Final_Report.md',
 './final_docling_output/Commercial_Satellite_Report.md',
 './final_docling_output/Counter-Insurgency_Study.md',
 './final_docling_output/CAMMS_v_CPX.md',
 './final_docling_output/POLITICA_and_AGILE-COIN_1966.md',
 './final_docling_output/ODNA_Transformation_Strategy_GameIV.md',
 './final_docling_output/Olympiad_I-62.md',
 './final_docling_output/WAGCAP_Volume_V_Part_III_DIVWAG_Prog_Manual.md',
 './final_docling_output/WAGCAP_Volume_VII_Testing_Report.md',
 './final_docling_output/WAGCAP_Phase_II_AppB_Part_II.md',
 './final_docling_output/WAGCAP_Phase_II_AppC.md',
 './final_docling_output/Warriors_Edge.md',
 './final_docling_output/WAGCAP_Phase_II.md',
 './final_docling_output/Theater_Battle_Model_Volume_VII.md',
 './final_docling_output/Stress_Testing_Financial_Crisis_2007.md']