In [None]:
#############################################################

# Copyright 2025 North Carolina State University

# Authored by
# Pranshav Gajjar, Abiodun Ganiyu, and Vijay K. Shah
# NextG Wireless Lab, North Carolina State University

############################################################# 

import os
import zipfile
from pathlib import Path
import fitz  # PyMuPDF for PDF processing
from docx import Document  # python-docx for Word processing
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document as LangchainDocument

# Unzip the docs
def unzip_docs(zip_file_path, extract_to='docs'):
    """Unpack every member of a zip archive into a target directory.

    Args:
        zip_file_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted members
            (defaults to ``'docs'``).

    Returns:
        None. The only effect is the files written under ``extract_to``.
    """
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

# Extract text from a PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Location of the PDF; passed straight to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` result of each page, joined in page
        order with no separator. Empty string when the document yields no
        pages.
    """
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with ``+=`` per page, which
        # may re-copy the accumulated text on every iteration.
        return "".join(page.get_text() for page in doc)

# Extract text from a Word document
def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Location of the ``.docx`` file; passed to ``Document``.

    Returns:
        str: The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters. Only ``paragraphs`` is read.
    """
    paragraphs = Document(file_path).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

# Extract text from a Markdown file
def extract_text_from_md(file_path):
    """Read a Markdown file and return its full text.

    Args:
        file_path: Location of the file; it is opened in text mode and
            decoded as UTF-8.

    Returns:
        str: Everything ``read()`` returns for the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Process documents and extract text
def process_documents(directory):
    """Recursively load every supported document found under ``directory``.

    Entries are matched by case-insensitive suffix: ``.pdf``, ``.docx`` and
    ``.md``. Anything else is skipped.

    Args:
        directory: Root folder to walk (``Path(directory).rglob('*')``).

    Returns:
        list: One ``LangchainDocument`` per matched file, with the extracted
        text as ``page_content`` and ``{"source": str(file_path)}`` as
        ``metadata``. Order follows the order in which ``rglob`` yields paths.

    Raises:
        Whatever the selected extractor raises for an unreadable file; errors
        are not swallowed here.
    """
    # Suffix -> extractor. Built inside the function so the extractor names
    # are resolved when the function is called.
    extractors = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.md': extract_text_from_md,
    }

    documents = []
    for file_path in Path(directory).rglob('*'):
        extractor = extractors.get(file_path.suffix.lower())
        if extractor is None:
            continue
        # rglob('*') yields directories as well as files; a directory whose
        # name ends in a supported suffix must not be handed to an extractor.
        if not file_path.is_file():
            continue
        documents.append(
            LangchainDocument(
                page_content=extractor(file_path),
                metadata={"source": str(file_path)},
            )
        )
    return documents

# Example usage
zip_file_path = 'docs.zip'

# Unpack the archive into 'docs' (the default target of unzip_docs), then
# load every supported document found under that same folder.
unzip_docs(zip_file_path, extract_to='docs')
docs = process_documents('docs')

In [None]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: chunk_size=1024 with chunk_overlap=128. No custom
# `separators` are passed, so the splitter uses its library defaults.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
chunked_docs = splitter.split_documents(docs)

# Bare last expression so the notebook displays the number of chunks.
len(chunked_docs)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model setup (no FAISS index is loaded in this cell).
# The model is asked to run on the 'cuda' device.
model_kwargs = dict(device='cuda')

# NOTE(review): HuggingFaceEmbeddings is imported from `langchain.embeddings`
# in this notebook. If the installed LangChain version warns that this path is
# deprecated, switch the import to the location it recommends - confirm
# against the installed version.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs=model_kwargs,
)

In [None]:
from tqdm import tqdm 
# Build FAISS shards from `chunked_docs`.
#
# Chunks are embedded in batches of EMBED_BATCH_SIZE rather than one at a
# time, so the embedding model receives real batches instead of single texts.
# Shard boundaries and the names of the saved folders are unchanged: every
# full shard of SHARD_SIZE chunks is written as
# `FAISS/final_bge_ORAN_batch_dc_<chunks ingested so far>` and a trailing
# partial shard as `FAISS/final_bge_ORAN_batch_<shard number>`.
#
# NOTE(review): the two folder-name patterns above differ (`_dc_<count>` vs
# `_<shard number>`). They never collide, and the names are kept as-is so
# existing folders remain valid - confirm whether they should be unified.
# NOTE(review): shards left in `FAISS/` by an earlier run are not removed here.
SHARD_SIZE = 25000       # chunks per saved FAISS shard
EMBED_BATCH_SIZE = 256   # chunks passed to the vector store per call

db = None
doc_counter = 0  # Number of chunks ingested so far
batch_count = 0  # Number of shards written so far

with tqdm(total=len(chunked_docs), desc="Ingesting documents") as pbar:
    for shard_start in range(0, len(chunked_docs), SHARD_SIZE):
        shard_docs = chunked_docs[shard_start:shard_start + SHARD_SIZE]
        db = None  # Each shard gets its own fresh index

        for batch_start in range(0, len(shard_docs), EMBED_BATCH_SIZE):
            batch = shard_docs[batch_start:batch_start + EMBED_BATCH_SIZE]
            if db is None:
                db = FAISS.from_documents(batch, embeddings)
            else:
                db.add_documents(batch)
            doc_counter += len(batch)
            pbar.update(len(batch))

        # Only full shards are saved inside the loop; a partial final shard
        # falls through to the save below with the other naming pattern.
        if len(shard_docs) == SHARD_SIZE:
            batch_count += 1
            db.save_local(f'FAISS/final_bge_ORAN_batch_dc_{doc_counter}')
            db = None

# Save any remaining chunks (a final shard smaller than SHARD_SIZE).
if db is not None:
    batch_count += 1
    db.save_local(f'FAISS/final_bge_ORAN_batch_{batch_count}')

In [None]:
import os
from langchain.vectorstores import FAISS

# Folder that holds one sub-directory per saved FAISS shard.
faiss_folder = "FAISS"

# Every sub-directory of `faiss_folder` is treated as a saved FAISS store.
# os.listdir returns entries in arbitrary order, so the paths are sorted to
# make the merge order (and therefore the merged index) reproducible.
db_paths = sorted(
    os.path.join(faiss_folder, entry)
    for entry in os.listdir(faiss_folder)
    if os.path.isdir(os.path.join(faiss_folder, entry))
)

# Fail early with a clear message when there is nothing to merge.
if not db_paths:
    raise ValueError("No FAISS databases found in the folder.")

# SECURITY: allow_dangerous_deserialization=True opts in to deserializing the
# saved store files. Only load stores from a trusted source (here: the shards
# written by this notebook's ingestion cell).
main_db = FAISS.load_local(db_paths[0], embeddings, allow_dangerous_deserialization=True)

# Fold every remaining shard into the first one.
for path in db_paths[1:]:
    db_to_merge = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    main_db.merge_from(db_to_merge)

In [None]:
# Persist the merged vector store (`main_db`, built by the merge cell above)
# to the local folder "merged_with_metadata".
main_db.save_local("merged_with_metadata")