In [3]:
%pip install beautifulsoup4 lxml sentence-transformers faiss-cpu




In [4]:
import os
from bs4 import BeautifulSoup
doc_root = r"D:\thesis\PyPore3D_documentation\PyPore3D_documentation\functions"

### Extract text from html files

function for reading html

In [5]:
def extract_text_from_html(html_file):
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "lxml")
        # Extract visible text from the body tag
        return soup.get_text(separator="\n")

Function for traversing the files and reading the html docs

In [6]:
doc_chunks = []

# Traverse the folder and process files with .html extensions
for root, dirs, files in os.walk(doc_root):
    for file in files:
        print(f"Found file: {file}")
        # Check for .html extension
        if file.lower().endswith(".html"):  
            file_path = os.path.join(root, file)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    # Parse HTML using BeautifulSoup
                    soup = BeautifulSoup(f, "lxml")
                    
                    # Extract text from the body of the HTML file
                    body_text = soup.body.get_text(separator=" ", strip=True) if soup.body else ""
                    
                    # Ensure non-empty content is stored
                    if body_text:
                        doc_chunks.append(body_text)
            except Exception as e:
                print(f"Failed to process {file_path}: {e}")

# Output to confirm extraction
print(f"Number of processed HTML files: {len(doc_chunks)}")
if doc_chunks:
    print(f"Sample extracted text from first file: {doc_chunks[0][:200]}...")
else:
    print("No valid HTML files were processed.")

Found file: p3dBlobPy.py_p3dAnisotropyAnalysis.html
Found file: p3dBlobPy.py_p3dBasicAnalysis.html
Found file: p3dBlobPy.py_p3dBlobAnalysis.html
Found file: p3dBlobPy.py_p3dBlobLabeling.html
Found file: p3dBlobPy.py_p3dBlobLabeling_8.html
Found file: p3dBlobPy.py_p3dChamferDT.html
Found file: p3dBlobPy.py_p3dGetMaxVolumeBlob3D.html
Found file: p3dBlobPy.py_p3dGetMinVolumeBlob3D.html
Found file: p3dBlobPy.py_p3dMinVolumeFilter3D.html
Found file: p3dBlobPy.py_p3dMorphometricAnalysis.html
Found file: p3dBlobPy.py_p3dSquaredEuclideanDT.html
Found file: p3dBlobPy.py_p3dTextureAnalysis.html
Found file: p3dFiltPy.py_p3dAnisotropicDiffusionFilter16.html
Found file: p3dFiltPy.py_p3dAnisotropicDiffusionFilter8.html
Found file: p3dFiltPy.py_p3dAutoThresholding16.html
Found file: p3dFiltPy.py_p3dAutoThresholding8.html
Found file: p3dFiltPy.py_p3dBilateralFilter16.html
Found file: p3dFiltPy.py_p3dBilateralFilter8.html
Found file: p3dFiltPy.py_p3dBoinHaibelRingRemover8.html
Found file: p3dFiltPy.py_

### Data cleaning

Preprocessing function

In [7]:
def clean_text(text):
    lines = text.splitlines()
    # Remove empty lines and trim whitespace
    return "\n".join(line.strip() for line in lines if line.strip())

In [8]:
doc_chunks = [clean_text(chunk) for chunk in doc_chunks]
if len(doc_chunks) == 0:
    raise ValueError("No valid text extracted from the HTML files.")

### Faiss Indexing

Generating Embeddings

In [9]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(doc_chunks, convert_to_tensor=True)

In [11]:
print(f"Shape of embeddings: {embeddings.size()}")

Shape of embeddings: torch.Size([58, 384])


Creating and Populating the FAISS Index

In [12]:
import faiss
import numpy as np

In [13]:
print(f"Number of document chunks: {len(doc_chunks)}")
if len(doc_chunks) > 0:
    print(f"First document chunk: {doc_chunks[0]}")
else:
    print("No document chunks found! Check your HTML files.")

Number of document chunks: 58
First document chunk: PyPore3D p3dFiltPy p3dBlobPy p3dSkelPy p3dSITKPy p3dFiltPy_16 p3dSITKPy_16 p3dFiltPy.py_p3dReadRaw8 p3dFiltPy.py_p3dWriteRaw8 p3dFiltPy.py_p3dGaussianFilter8 p3dFiltPy.py_p3dMeanFilter8 p3dFiltPy.py_p3dMedianFilter8 p3dFiltPy.py_p3dAnisotropicDiffusionFilter8 p3dFiltPy.py_p3dBilateralFilter8 p3dFiltPy.py_p3dSijbersPostnovRingRemover8 p3dFiltPy.py_p3dClearBorderFilter8 p3dFiltPy.py_p3dAutoThresholding8 p3dFiltPy_16.py_p3dReadRaw16 p3dFiltPy_16.py_p3dWriteRaw16 p3dFiltPy_16.py_p3dAutoThresholding16 p3dFiltPy_16.py_p3dGaussianFilter16 p3dFiltPy_16.py_p3dMeanFilter16 p3dFiltPy_16.py_p3dMedianFilter16 p3dFiltPy_16.py_p3dAnisotropicDiffusionFilter16 p3dFiltPy_16.py_p3dBilateralFilter16 p3dFiltPy_16.py_p3dSijbersPostnovRingRemover16 p3dBlobPy.py_p3dBasicAnalysis p3dBlobPy.py_p3dAnisotropyAnalysis p3dBlobPy.py_p3dBlobLabeling p3dBlobPy.py_p3dGetMaxVolumeBlob3D p3dBlobPy.py_p3dGetMinVolumeBlob3D p3dBlobPy.py_p3dChamferDT p3dBlobPy.py_p3dMorpho

In [14]:
embeddings_np = embeddings.detach().cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

Search

In [15]:
results = ""
def search_documentation(query, index, chunks, embedder, top_k=3):
    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = [chunks[i] for i in indices[0]]
    return results

### Codegemma

In [21]:
import requests
import json
url = "http://localhost:11434/api/generate"
payload = {
    "model": "codegemma",
    "prompt": results
}
response = requests.post(url, json=payload)
print("Status Code:", response.status_code)
    
raw_text = response.text
print("Raw Response Text:", raw_text)

Status Code: 200
Raw Response Text: {"model":"codegemma","created_at":"2024-12-30T02:00:31.9148334Z","response":"","done":true,"done_reason":"load"}


In [22]:
lines = raw_text.strip().split("\n")
words = []
for line in lines:
    try:
        json_obj = json.loads(line)
        words.append(json_obj.get("response", ""))
    except json.JSONDecodeError:
        print(f"Failed to parse line: {line}")

# Concatenate all words
result = "".join(words)

print("Concatenated Result:", result)

Concatenated Result: 


In [None]:
payload = {
    "model": "codegemma",
    "prompt": "can you explain to me what p3dFiltPy_16 is"
}
response = requests.post(url, json=payload)
raw_text = response.text
lines = raw_text.strip().split("\n")
words = []
for line in lines:
    try:
        json_obj = json.loads(line)
        words.append(json_obj.get("response", ""))
    except json.JSONDecodeError:
        print(f"Failed to parse line: {line}")

# Concatenate all words
result = "".join(words)

print("Concatenated Result:", result)

Concatenated Result: **p3dFiltPy_16** is a Python module developed by the **Princeton University Human Cognitive Neuroscience Lab (Princeton HCN Lab)**. It is specifically designed for **processing and analyzing fMRI data** acquired using the **fMRIprep software**.

**Purpose of p3dFiltPy_16:**

* Provides functions for preprocessing and filtering fMRI data, including:
    * Motion correction
    * Artifact removal
    * Spatial smoothing
* Facilitates the application of various statistical models to fMRI data, such as:
    * Linear mixed-effects models (LME)
    * General linear models (GLM)
* Enables the extraction of statistical estimates, including contrasts and effect sizes.

**Key Features:**

* Uses the **p3dfilt library**, which provides efficient implementations of common fMRI preprocessing and analysis techniques.
* Integrates with the **fMRIprep pipeline**, allowing for seamless data preprocessing within the pipeline framework.
* Offers a range of options for customization, 