In [1]:
!pip install pdfplumber pymupdf pytesseract pillow


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/5.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/5.6 MB ? eta -:--:--
   --- ------------------------------------ 0.5/5.6 MB 597.5 kB/s eta 0:00:09
   --- ---------------

In [6]:
import os
import pdfplumber
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import pytesseract

# Tell pytesseract where the actual Tesseract program is installed
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# If Tesseract is not in PATH, uncomment and set the path manually
# Example (Windows default):
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Define input and output folders
input_dir = "../data/manuals"
output_dir = "../data/processed"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

print("Input folder:", input_dir)
print("Output folder:", output_dir)


Input folder: ../data/manuals
Output folder: ../data/processed


In [7]:
def extract_text_from_pdf(pdf_path, use_ocr=True, zoom=2):
    """Extracts text from a PDF using pdfplumber; falls back to OCR if needed."""
    outputs = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                page_text = page.extract_text() or ""
                if page_text.strip():
                    outputs.append(f"\n--- PAGE {i+1} ---\n")
                    outputs.append(page_text)
                else:
                    if use_ocr:
                        try:
                            doc = fitz.open(pdf_path)
                            p = doc.load_page(i)
                            mat = fitz.Matrix(zoom, zoom)
                            pix = p.get_pixmap(matrix=mat, alpha=False)
                            mode = "RGB" if pix.n < 4 else "RGBA"
                            img = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
                            ocr_text = pytesseract.image_to_string(img, lang='eng')
                            outputs.append(f"\n--- PAGE {i+1} (OCR) ---\n")
                            outputs.append(ocr_text)
                            doc.close()
                        except Exception as e:
                            outputs.append(f"\n--- PAGE {i+1} (OCR FAILED) ---\n")
                            outputs.append(str(e))
                    else:
                        outputs.append(f"\n--- PAGE {i+1} (NO TEXT) ---\n")
    except Exception as e:
        outputs.append(f"[ERROR opening {pdf_path}: {e}]")

    return "\n".join(outputs)


In [8]:
sample_pdf = os.path.join(input_dir, "LG_Fridge_1.pdf")  # 👈 change this to match one of your files

text = extract_text_from_pdf(sample_pdf, use_ocr=True)

print("First 800 characters:\n")
print(text[:800])


First 800 characters:


--- PAGE 1 (OCR) ---

TONVdS3 | HSINDN3

OWNER’'S MANUAL
BOTTOM FREEZE
REFRIGERATOR

Please read this manual carefully before operating
your set and retain it for future reference.

MANUAL DE USUARIO
REFRIGERADOR
CON CONGELADOR INFERIOR

Lea detenidamente este manual antes de empezar
a utilizar el refrigerador y guardelo como referencia
para el futuro.

Model Name/Nombre de Modelo * =color number/numero de color

LFXS24623*
GM63SGS

P/No. MFL67227304-1 www.lg.com


--- PAGE 2 (OCR) ---



--- PAGE 3 (OCR) ---

TABLE OF CONTENTS

English Version
Spanish Version

Important Safety Instructions
Requirements for Ground Connection

Parts and Features

Refrigerator Installation
Unpacking
Installation
How to remove and install Handle
How to remove and Install the
Refrigerator Doors
How to remove 


In [10]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

pdf_path = r"E:\Companion-AI\data\manuals\LG_Fridge_1.pdf"   # your input PDF
output_txt = r"E:\Companion-AI\data\extracted_texts\manual_ocr.txt"  # your output file

doc = fitz.open(pdf_path)

all_text = ""  # collect OCR text from all pages

for page_num in range(len(doc)):
    page = doc[page_num]
    pix = page.get_pixmap()  # render page as image
    img = Image.open(io.BytesIO(pix.tobytes("png")))

    text = pytesseract.image_to_string(img)
    all_text += f"\n--- PAGE {page_num+1} (OCR) ---\n\n{text}\n"

# Save everything into a single .txt file
with open(output_txt, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"OCR text saved to {output_txt}")


OCR text saved to E:\Companion-AI\data\extracted_texts\manual_ocr.txt


In [11]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image

# Set Tesseract path (update this if needed)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Define input (manual PDFs) and output (extracted text) folders
input_folder = r"E:\Companion-AI\data\manuals"
output_folder = r"E:\Companion-AI\data\extracted_texts"

# Create output folder if it doesn’t exist
os.makedirs(output_folder, exist_ok=True)

# Loop through all PDFs in the manuals folder
for filename in os.listdir(input_folder):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(input_folder, filename)
        output_file = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(output_folder, output_file)

        print(f"Processing: {filename} → {output_file}")

        doc = fitz.open(pdf_path)
        full_text = ""

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Try extracting text directly
            text = page.get_text()
            if text.strip():
                full_text += f"\n--- PAGE {page_num+1} (TEXT) ---\n{text}\n"
            else:
                # Fall back to OCR
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_text = pytesseract.image_to_string(img)
                full_text += f"\n--- PAGE {page_num+1} (OCR) ---\n{ocr_text}\n"

        # Save extracted text
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

print("✅ All manuals converted to text and saved in:", output_folder)


Processing: LG_Fridge_1.pdf → LG_Fridge_1.txt
Processing: LG_Fridge_2.pdf → LG_Fridge_2.txt
Processing: LG_Fridge_3.pdf → LG_Fridge_3.txt
Processing: LG_WM_1.pdf → LG_WM_1.txt
Processing: LG_WM_2.pdf → LG_WM_2.txt
Processing: LG_WM_3.pdf → LG_WM_3.txt
Processing: Sam_Fridge_1.pdf → Sam_Fridge_1.txt
Processing: Sam_Fridge_2.pdf → Sam_Fridge_2.txt
Processing: Sam_Fridge_3.pdf → Sam_Fridge_3.txt
Processing: Sam_WM_1.pdf → Sam_WM_1.txt
Processing: Sam_WM_2.pdf → Sam_WM_2.txt
Processing: Sam_WM_3.pdf → Sam_WM_3.txt
✅ All manuals converted to text and saved in: E:\Companion-AI\data\extracted_texts


In [12]:
import os
import json

# -----------------------------
# SETTINGS
# -----------------------------
INPUT_FOLDER = r"E:\Companion-AI\data\extracted_texts"   # Where your .txt files are
OUTPUT_FILE = r"E:\Companion-AI\data\chunks\chunks.json" # JSON output path

CHUNK_SIZE = 500   # Number of words per chunk
OVERLAP = 50       # Number of words to overlap between chunks
# -----------------------------

def read_text_file(file_path):
    """Read a text file and return its content as a string."""
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """
    Split text into chunks with specified chunk size and overlap.
    Returns a list of strings (chunks).
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk_words = words[start:end]
        chunk_text = " ".join(chunk_words)
        chunks.append(chunk_text)
        start = end - overlap  # move start with overlap
        if start < 0:
            start = 0
    return chunks

# Make sure output folder exists
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Store all chunks here
all_chunks = []

# Iterate over all text files
for file_name in os.listdir(INPUT_FOLDER):
    if file_name.endswith(".txt"):
        file_path = os.path.join(INPUT_FOLDER, file_name)
        text = read_text_file(file_path)
        chunks = chunk_text(text)

        # Add metadata for each chunk
        for i, chunk in enumerate(chunks, 1):
            all_chunks.append({
                "file_name": file_name,
                "chunk_id": i,
                "text": chunk
            })

# Save all chunks to JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=4)

print(f"✅ Total {len(all_chunks)} chunks saved to {OUTPUT_FILE}")


✅ Total 422 chunks saved to E:\Companion-AI\data\chunks\chunks.json
