In [1]:
# Mount Google Drive (Colab only) so that the knowledge-base folder under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install python-docx pdfplumber python-pptx unstructured pytesseract
!apt install tesseract-ocr
# 1. Install pdf2image
!pip install pdf2image

# 2. Install poppler (required for pdf2image to work)
!apt-get install -y poppler-utils


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m446.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting unstructured
  Downloading unstructured-0.18.11-py3-none-any.whl.metadata (24 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting XlsxWriter>=0.5.7

In [3]:
import os
import json
import uuid
import textwrap
from pathlib import Path
import pytesseract
from PIL import Image
from docx import Document
from pptx import Presentation
import pdfplumber
from pdf2image import convert_from_path

In [4]:
# ✅ Folder setup
# Root of the knowledge base on the mounted Drive; get_all_documents() scans
# its sub-folders by default.
BASE_DIR = Path("/content/drive/MyDrive/Ethos LLM/Project_Root/06_LLM_Knowledge_Base")
# Generated artifacts (chunk JSONL, FAISS index, metadata JSON) are written here.
OUTPUT_DIR = BASE_DIR / "_metadata"
# No parents=True: this raises FileNotFoundError when BASE_DIR itself is
# missing (for example when Drive is not mounted).
OUTPUT_DIR.mkdir(exist_ok=True)

# Lower-case file extensions (with leading dot) that have a text extractor.
SUPPORTED_EXTS = [".pdf", ".docx", ".pptx"]

In [5]:
# ✅ 1. Get all documents
def get_all_documents(base_dir=BASE_DIR):
    """Collect every supported document below the category folders of ``base_dir``.

    Each immediate sub-folder of ``base_dir`` (except ``_metadata``) is treated
    as a category and searched recursively. Files that sit directly in
    ``base_dir`` are not picked up.

    Args:
        base_dir: Knowledge-base root, as a ``Path`` or a path string.

    Returns:
        A list of dicts with the keys ``"path"`` (``Path`` of the file),
        ``"category"`` (name of the top-level sub-folder) and ``"filename"``,
        in sorted path order.
    """
    base_dir = Path(base_dir)
    files = []
    # iterdir()/rglob() yield entries in filesystem-dependent order; sorting
    # makes the processing order reproducible between runs.
    for subfolder in sorted(base_dir.iterdir()):
        if not subfolder.is_dir() or subfolder.name == "_metadata":
            continue
        for file in sorted(subfolder.rglob("*")):
            # is_file() keeps out directories whose name happens to end in a
            # supported extension.
            if file.is_file() and file.suffix.lower() in SUPPORTED_EXTS:
                files.append({
                    "path": file,
                    "category": subfolder.name,
                    "filename": file.name
                })
    return files

In [6]:
# ✅ 2. Text extractors (with smart OCR fallback and logging)

from pdf2image import convert_from_path
import pdfplumber
import pytesseract
from docx import Document
from pptx import Presentation

def extract_text_from_pdf(file_path, min_text_length=30):
    """Extract text from a PDF page by page, using OCR for text-poor pages.

    For every page the embedded text layer is read with pdfplumber. When the
    stripped text is shorter than ``min_text_length`` characters, the page is
    rendered at 300 dpi and passed through Tesseract instead; OCR output is
    prefixed with ``[OCR only:]``.

    Args:
        file_path: PDF location, as a ``Path`` or a path string.
        min_text_length: Minimum length of a page's stripped text layer for
            OCR to be skipped.

    Returns:
        The text of all pages, each followed by a blank line. ``""`` is
        returned when the PDF cannot be opened or read. A page whose OCR fails
        contributes only the separator.
    """
    # The log messages below use ``file_path.name``. Without this coercion a
    # plain string path raised AttributeError from inside the except handler
    # instead of returning "".
    file_path = Path(file_path)
    page_texts = []

    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

            for i, page in enumerate(pdf.pages):
                print(f"🧾 [{file_path.name}] Extracting page {i+1}/{total_pages}...")

                # 1. Try standard text extraction
                text_only = (page.extract_text() or "").strip()

                if len(text_only) >= min_text_length:
                    # ✅ Enough text found, no OCR needed
                    combined_text = text_only
                else:
                    # ⚠️ Run OCR if text is missing or too short
                    print(f"🔍 Running OCR on page {i+1}/{total_pages} of {file_path.name}...")
                    try:
                        # Render only the current page (pdf2image pages are 1-based).
                        images = convert_from_path(
                            str(file_path), dpi=300,
                            first_page=i+1, last_page=i+1
                        )
                        ocr_text = pytesseract.image_to_string(images[0])
                        combined_text = f"[OCR only:]\n{ocr_text.strip()}"
                    except Exception as e:
                        # Best effort: one unreadable page must not discard the document.
                        print(f"⚠️ OCR failed on page {i+1} of {file_path.name}: {e}")
                        combined_text = ""

                page_texts.append(combined_text)

    except Exception as e:
        print(f"❌ Failed to open PDF {file_path.name}: {e}")
        return ""

    return "".join(page_text + "\n\n" for page_text in page_texts)


def extract_text_from_docx(file_path):
    """Return the paragraphs of a DOCX file joined by newlines.

    Only ``doc.paragraphs`` is read. Any error while opening or reading the
    file is printed and results in ``""``.
    """
    try:
        document = Document(file_path)
        lines = []
        for paragraph in document.paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as err:
        print(f"Error reading DOCX {file_path}: {err}")
        return ""


def extract_text_from_pptx(file_path):
    """Return the text of every shape exposing ``.text``, one shape per line.

    Slides are visited in order and, within a slide, shapes in the order
    ``slide.shapes`` yields them. Any error while opening or reading the file
    is printed and results in ``""``.
    """
    try:
        deck = Presentation(file_path)
        collected = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(collected)
    except Exception as err:
        print(f"Error reading PPTX {file_path}: {err}")
        return ""


def extract_text(file_record):
    """Dispatch a file record to the extractor matching its extension.

    ``file_record["path"]`` must be a ``Path``; the suffix is compared
    case-insensitively. Unsupported extensions yield ``""``.
    """
    doc_path = file_record["path"]
    suffix = doc_path.suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(doc_path)
    if suffix == ".docx":
        return extract_text_from_docx(doc_path)
    if suffix == ".pptx":
        return extract_text_from_pptx(doc_path)
    return ""


In [7]:
# ✅ 3. Chunking
def chunk_text(text, chunk_size=800, overlap=100):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks share ``overlap`` characters. Chunking stops as soon as
    a chunk reaches the end of ``text``, so no trailing chunk is produced that
    consists only of characters already present in the previous chunk.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order; ``[]`` for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. With
            ``overlap >= chunk_size`` the window would never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already covers the tail of the text.
            break
        start += step
    return chunks

In [8]:
def load_processed_files(jsonl_path):
    """Return the set of filenames that already have records in ``jsonl_path``.

    Args:
        jsonl_path: Path of the chunk JSONL file; it may not exist yet.

    Returns:
        A set with the ``"filename"`` value of every parseable record. Lines
        that are blank, are not valid JSON, or lack a ``"filename"`` key are
        skipped. An empty set is returned when the file does not exist.
    """
    processed_files = set()
    if not os.path.exists(jsonl_path):
        return processed_files

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                processed_files.add(json.loads(line)["filename"])
            except (ValueError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError (blank or truncated
                # line); KeyError/TypeError cover records without a usable
                # "filename". A bare `except:` would also swallow
                # KeyboardInterrupt, so only these are caught.
                continue
    return processed_files


In [9]:
def process_documents_to_chunks_streamed(output_path=OUTPUT_DIR / "document_chunks.jsonl"):
    """Extract, chunk and append every not-yet-processed document to a JSONL file.

    A document is skipped when its bare filename already appears in
    ``output_path``; two files with the same name in different categories are
    therefore treated as the same document. Each written line is one JSON
    object with the keys ``id``, ``text``, ``chunk_index``, ``filename``,
    ``category`` and ``source_path``.

    Args:
        output_path: JSONL file that is read for already-processed filenames
            and then opened in append mode.

    Returns:
        None. Failures for a single document are printed and do not stop the
        run.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported tqdm before it is called.
    from tqdm import tqdm

    all_files = get_all_documents()
    processed_files = load_processed_files(output_path)

    print(f"🗂 Found {len(all_files)} files. Skipping {len(processed_files)} already processed...")

    # Open file once for streaming write
    with open(output_path, "a", encoding="utf-8") as f:  # use 'append' mode!
        for file_record in tqdm(all_files, desc="Processing docs"):
            if file_record["filename"] in processed_files:
                continue

            print(f"📄 Now processing: {file_record['category']} → {file_record['filename']}")

            try:
                raw_text = extract_text(file_record)

                lines = []
                for i, chunk in enumerate(chunk_text(raw_text)):
                    chunk_body = chunk.strip()
                    if not chunk_body:
                        # Whitespace-only chunks carry no content, and the
                        # embeddings endpoint rejects empty strings.
                        continue
                    chunk_data = {
                        "id": str(uuid.uuid4()),
                        "text": chunk_body,
                        "chunk_index": i,
                        "filename": file_record["filename"],
                        "category": file_record["category"],
                        "source_path": str(file_record["path"]),
                    }
                    lines.append(json.dumps(chunk_data) + "\n")

                # Write the whole document at once and flush, so finished
                # documents reach the file before the next (possibly slow)
                # extraction starts.
                f.writelines(lines)
                f.flush()

            except Exception as e:
                print(f"❌ Failed to process {file_record['filename']}: {e}")


In [10]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [11]:
import faiss
import json
import numpy as np
from openai import OpenAI

# The key is read from the OPENAI_API_KEY environment variable; set it before
# running this cell (for example from Colab's secret store). Never paste a key
# into the notebook: the key that used to be hardcoded on this line has been
# exposed and must be revoked.
client = OpenAI()  # Automatically uses your API key from environment

# Load chunks from .jsonl
def load_chunks(jsonl_path):
    """Read a JSONL file and return its records as a list.

    Args:
        jsonl_path: Path of a file holding one JSON document per line.

    Returns:
        The parsed records in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    chunks = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a stray trailing newline) is not a record
            # and would otherwise abort the load with a JSONDecodeError.
            if not line.strip():
                continue
            chunks.append(json.loads(line))
    return chunks

# Batch OpenAI embeddings using the new API client
def embed_texts(texts, model="text-embedding-3-small", batch_size=20):
    """Embed ``texts`` with the OpenAI embeddings endpoint, in batches.

    Args:
        texts: List of strings to embed.
        model: Name of the embedding model.
        batch_size: Number of texts sent per request; must be at least 1.

    Returns:
        A float32 NumPy array with one row per successfully embedded text.
        When a batch fails the error is printed and that batch's rows are
        omitted, so the result can be SHORTER than ``texts``. Callers that
        pair rows with inputs by position must compare the lengths.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model)
            vectors = [d.embedding for d in response.data]
            embeddings.extend(vectors)
        except Exception as e:
            # Best effort: report the failed batch and keep going.
            print(f"❌ Embedding error at batch {i}: {e}")
    return np.array(embeddings).astype("float32")

# Create FAISS index and save metadata
def build_faiss_index(
    chunks,
    index_path=OUTPUT_DIR / "document_index.faiss",
    metadata_path=OUTPUT_DIR / "document_metadata.json"
):
    """Embed the chunks, build a flat L2 FAISS index and save it with its metadata.

    Row ``i`` of the saved index corresponds to entry ``i`` of the saved
    metadata list. Chunks whose text is blank are left out of both. Nothing is
    written unless every remaining chunk received an embedding.

    Args:
        chunks: List of dicts, each with at least a ``"text"`` key.
        index_path: Destination of the FAISS index file.
        metadata_path: Destination of the JSON metadata file.

    Returns:
        None. Problems are reported with a printed message.
    """
    # The embeddings endpoint rejects empty strings, which would make the
    # whole batch containing one fail; drop such chunks up front.
    indexable = [chunk for chunk in chunks if chunk["text"].strip()]
    texts = [chunk["text"] for chunk in indexable]
    vectors = embed_texts(texts)

    if len(vectors) == 0:
        print("❌ No embeddings were generated. Check for OpenAI errors.")
        return

    if len(vectors) != len(indexable):
        # embed_texts omits failed batches. Saving now would shift every later
        # index row against the metadata and return wrong chunks at query time.
        print(
            f"❌ Only {len(vectors)} of {len(indexable)} chunks were embedded. "
            "Index not saved; fix the embedding errors above and re-run."
        )
        return

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    faiss.write_index(index, str(index_path))

    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(indexable, f, indent=2)

    print(f"✅ FAISS index and metadata saved to:\n{index_path}\n{metadata_path}")


In [12]:
from tqdm import tqdm

# Step 1: Generate chunks
# Appends to document_chunks.jsonl; files already listed there are skipped.
process_documents_to_chunks_streamed()


# Step 2: Load and embed
# Every chunk in the JSONL is embedded again and the index and metadata files
# are rewritten on each run.
chunks = load_chunks(OUTPUT_DIR / "document_chunks.jsonl")
build_faiss_index(chunks)


🗂 Found 17 files. Skipping 17 already processed...


Processing docs: 100%|██████████| 17/17 [00:00<00:00, 96616.76it/s]


✅ FAISS index and metadata saved to:
/content/drive/MyDrive/Ethos LLM/Project_Root/06_LLM_Knowledge_Base/_metadata/document_index.faiss
/content/drive/MyDrive/Ethos LLM/Project_Root/06_LLM_Knowledge_Base/_metadata/document_metadata.json


In [13]:
import faiss
import openai
import json
import numpy as np
from pathlib import Path

# Paths
# BASE_PATH points at the same directory as OUTPUT_DIR from the folder-setup cell.
BASE_PATH = Path("/content/drive/MyDrive/Ethos LLM/Project_Root/06_LLM_Knowledge_Base/_metadata")
# Chunk records, one JSON object per line.
CHUNKS_PATH = BASE_PATH / "document_chunks.jsonl"
# FAISS index file and the JSON list of chunk metadata saved next to it.
INDEX_PATH = BASE_PATH / "document_index.faiss"
METADATA_PATH = BASE_PATH / "document_metadata.json"

# Load chunks
def load_chunks(path):
    """Read a JSONL file and return its records as a list.

    Note: this redefines the ``load_chunks`` from the earlier embedding cell.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        The parsed records in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    chunks = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a stray trailing newline) is not a record
            # and would otherwise abort the load with a JSONDecodeError.
            if not line.strip():
                continue
            chunks.append(json.loads(line))
    return chunks

# Embed with OpenAI
def embed_texts(texts, model="text-embedding-3-small", batch_size=20):
    """Embed ``texts`` with the OpenAI embeddings endpoint, in batches.

    Note: this redefines the ``embed_texts`` from the earlier embedding cell.
    It uses the module-level ``client`` created there, because the legacy
    ``openai.Embedding.create`` call no longer exists in openai>=1.0 (the
    version the ``from openai import OpenAI`` imports in this notebook need).

    Args:
        texts: List of strings to embed.
        model: Name of the embedding model.
        batch_size: Number of texts sent per request; must be at least 1.

    Returns:
        A float32 NumPy array with one row per successfully embedded text.
        When a batch fails the error is printed and that batch's rows are
        omitted, so the result can be SHORTER than ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model)
            # v1 responses are objects, not dicts: use attribute access.
            vectors.extend([d.embedding for d in response.data])
        except Exception as e:
            # Best effort: report the failed batch and keep going.
            print(f"❌ Embedding failed at batch {i}: {e}")
    return np.array(vectors).astype("float32")

# Build FAISS index
def build_faiss_index(chunks, index_path=INDEX_PATH, metadata_path=METADATA_PATH):
    """Embed the chunks, build a flat L2 FAISS index and save it with its metadata.

    Note: this redefines the ``build_faiss_index`` from the earlier embedding
    cell. Row ``i`` of the saved index corresponds to entry ``i`` of the saved
    metadata list. Chunks whose text is blank are left out of both. Nothing is
    written unless every remaining chunk received an embedding.

    Args:
        chunks: List of dicts, each with at least a ``"text"`` key.
        index_path: Destination of the FAISS index file.
        metadata_path: Destination of the JSON metadata file.

    Returns:
        None. Problems are reported with a printed message.
    """
    # The embeddings endpoint rejects empty strings, which would make the
    # whole batch containing one fail; drop such chunks up front.
    indexable = [chunk for chunk in chunks if chunk["text"].strip()]
    texts = [chunk["text"] for chunk in indexable]
    vectors = embed_texts(texts)

    if len(vectors) == 0:
        # Without this guard `vectors[0]` below raises IndexError.
        print("❌ No embeddings were generated. Check for OpenAI errors.")
        return

    if len(vectors) != len(indexable):
        # embed_texts omits failed batches. Saving now would shift every later
        # index row against the metadata and return wrong chunks at query time.
        print(
            f"❌ Only {len(vectors)} of {len(indexable)} chunks were embedded. "
            "Index not saved; fix the embedding errors above and re-run."
        )
        return

    index = faiss.IndexFlatL2(len(vectors[0]))
    index.add(vectors)

    faiss.write_index(index, str(index_path))
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(indexable, f, indent=2)

    print(f"✅ Saved FAISS index to {index_path}")
    print(f"✅ Saved metadata to {metadata_path}")


In [14]:
# Reload FAISS index and metadata from disk
# `metadata` is the JSON list saved by build_faiss_index; entry i is expected
# to describe the chunk stored at row i of `index`.
index = faiss.read_index(str(INDEX_PATH))
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)


In [15]:
from openai import OpenAI, OpenAIError
import faiss
import json
import numpy as np
from pathlib import Path

# 🔐 The API key is read from the OPENAI_API_KEY environment variable; set it
# before running this cell (for example from Colab's secret store). Never paste
# a key into the notebook: the key that used to be hardcoded on this line has
# been exposed and must be revoked.
client = OpenAI()

# 📁 Paths to index + metadata
BASE_PATH = Path("/content/drive/MyDrive/Ethos LLM/Project_Root/06_LLM_Knowledge_Base/_metadata")
INDEX_PATH = BASE_PATH / "document_index.faiss"
METADATA_PATH = BASE_PATH / "document_metadata.json"

# 🔍 Query the knowledgebase
def query_knowledgebase(question, k=5):
    """Return the ``k`` stored chunks closest to ``question``.

    The question is embedded with ``text-embedding-3-small`` and searched
    against the FAISS index at ``INDEX_PATH``. The index and the metadata file
    are read from disk on every call.

    Args:
        question: Natural-language query string.
        k: Number of nearest neighbours to request.

    Returns:
        A list of dicts with the keys ``"source"`` (``"<category> → <filename>"``)
        and ``"text"``, nearest first. It holds fewer than ``k`` entries when
        the index holds fewer than ``k`` vectors. ``None`` is returned when
        embedding the question fails.
    """
    try:
        # 🔹 Embed the user question using OpenAI's new SDK
        response = client.embeddings.create(
            input=question,
            model="text-embedding-3-small"
        )
        query_embedding = response.data[0].embedding
    except OpenAIError as e:
        print(f"❌ Embedding failed: {e}")
        return None

    # 🔹 Load FAISS index and metadata
    index = faiss.read_index(str(INDEX_PATH))
    with open(METADATA_PATH, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # 🔹 Search FAISS with the question embedding
    query_vector = np.array([query_embedding], dtype="float32")
    _distances, neighbour_ids = index.search(query_vector, k)

    # 🔹 Collect top-k matching chunks
    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the result with -1 when fewer than k vectors exist. A bare
        # `idx < len(metadata)` lets -1 through, and metadata[-1] would then
        # silently return the LAST chunk as a match.
        if 0 <= idx < len(metadata):
            entry = metadata[idx]
            results.append({
                "source": f"{entry['category']} → {entry['filename']}",
                "text": entry['text']
            })

    return results


In [16]:
def answer_with_reasoning(question, chunks):
    """Answer ``question`` with GPT-4 using the retrieved chunks as context.

    Args:
        question: The user's question.
        chunks: Retrieved passages as dicts with a ``"text"`` key and an
            optional ``"source"`` label (the shape ``query_knowledgebase``
            returns). ``None`` or an empty list is treated as "no context".

    Returns:
        The model's answer as a stripped string.
    """
    # query_knowledgebase returns None when embedding the question fails.
    chunks = chunks or []

    def _format_chunk(chunk):
        # Show the source label when present, so the model can attribute
        # evidence; fall back to the bare header otherwise.
        label = chunk.get("source")
        header = f"Source: {label}" if label else "Source:"
        return f"{header}\n{chunk['text']}"

    context = "\n\n".join(_format_chunk(c) for c in chunks)

    glossary = """
Glossary (for this domain):

- E&O: Excess and Obsolete Inventory
- S&OP: Sales and Operations Planning
- ATP: Available to Promise
- WIP: Work in Progress
- MRP: Material Requirements Planning
- MOQ: Minimum Order Quantity
- SKU Rationalization: Reducing redundant SKUs to optimize efficiency
- Forecast Bias: Systematic error in demand forecasting
- Inventory Turns: Frequency of inventory turnover in a given period
"""

    # The prompt tells the model to "use the glossary below", so the glossary
    # has to be interpolated here; previously it was built but never sent.
    prompt = f"""
You are a supply chain expert assistant answering questions using trusted internal documents and domain-specific reasoning.

Use the glossary below to interpret key terms. If unfamiliar acronyms or supply chain terminology appear in the context or question, you should infer their meaning as part of the supply chain or S&OP domain unless clearly stated otherwise.

{glossary.strip()}

Base your answer on the following context. If the answer is not directly stated, infer using best practices in supply chain and inventory management.

Context:
{context}

Question:
{question}

Your answer should:
- Be thoughtful and complete
- Base your answer on both context and best practices
- Use glossary definitions when available
- Assume any unknown terms are part of supply chain terminology unless clearly stated otherwise
- Clearly separate direct evidence from inferences

Answer:
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )

    return response.choices[0].message.content.strip()


In [19]:
# Example end-to-end query: retrieve the 15 nearest chunks, then let the model
# answer from them.
question = "what are root causes of poor ATP accuracy?"

# query_knowledgebase returns None when embedding the question fails.
results = query_knowledgebase(question, k=15)
answer = answer_with_reasoning(question, results)
print(answer)


Direct Evidence:
The context provides several potential causes of poor ATP (Available to Promise) accuracy. These include:

1. Data errors: These can come from various sources such as numeric transpositions, typos, missing or incomplete data, older or not fully integrated databases with multiple versions of a record, and redundant databases in the network or different tags for the same objects. 

2. Delays in data collection: This can mean that the data arrive too late to be relevant.

3. Misidentification of product and units of measure: During annual inventories, misidentification often occurs because inexperienced counters assisting with the effort do not recognize items, misunderstand package descriptions, and so on.

4. Discrepancies “adjusted away”: If the reason for a discrepancy cannot be immediately found during the inventory, an adjustment is made with the underlying cause of the error never being corrected.

Inferences:
Based on best practices in supply chain and inventory m