In [4]:
import os
import faiss
import pickle
import subprocess
from pathlib import Path
from sentence_transformers import SentenceTransformer

# === CONFIG ===
TARGET_DIR = "./../../Jsm33t.com/"  # Change this
FILE_EXTENSIONS = {".cs"}
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
OUTPUT_FAISS = "doc_index.faiss"
OUTPUT_METADATA = "metadata.pkl"

# === INIT ===
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
texts = []
metadatas = []

# === HELPERS ===
def is_gitignored(path: Path) -> bool:
    try:
        subprocess.run(["git", "check-ignore", str(path)],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        return True
    except subprocess.CalledProcessError:
        return False

# === FILE WALK ===
for root, _, files in os.walk(TARGET_DIR):
    for file in files:
        path = Path(root) / file
        if path.suffix.lower() in FILE_EXTENSIONS and not is_gitignored(path):
            print(f"Indexing: {path}")
            try:
                content = path.read_text(encoding="utf-8", errors="ignore")
                if content.strip():
                    texts.append(content)
                    metadatas.append(str(path))
            except Exception as e:
                print(f"❌ Skipped {path} due to error: {e}")

# === EMBEDDING + SAVE ===
if texts:
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    faiss.write_index(index, OUTPUT_FAISS)
    with open(OUTPUT_METADATA, "wb") as f:
        pickle.dump(metadatas, f)

    print(f"\n✅ Indexed {len(texts)} files → Saved to {OUTPUT_FAISS} + {OUTPUT_METADATA}")
else:
    print("⚠️ No .cs files indexed.")


Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Program.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\AuthController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\BlogController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\ChangeLogController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\FcBaseController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\MediaCacheController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\ProfileController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Controllers\TestController.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Extensions\ServiceCollectionExtension.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Filters\DeductPointsAttribute.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Filters\DeductPointsFilter.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Middlewares\FcRequestMiddleware.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\Middlewares\RequestTimerMiddleware.cs
Indexing: ..\..\Jsm33t.com\Jsm33t.Api\obj\Debug\net9.0\.NETCoreApp,Version=v9.0.AssemblyAttr

Batches: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]


✅ Indexed 130 files → Saved to doc_index.faiss + metadata.pkl



