In [None]:
# Cell 1: Import Libraries and Load API Keys
import os
import uuid
import json
import pandas as pd
import yaml
import ast
import re
from html.parser import HTMLParser
import time
import openai
import tiktoken
import pinecone
from tqdm import tqdm
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from collections import defaultdict

# Load .env file
load_dotenv()

# Initialize clients
openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# Configuration
OUTPUT_DIR = "/Users/lokesh/Desktop/Model_Earth/rag-pipeline"
INDEX_NAME = "model-earth-jam-stack"
INDEX_DIMENSION = 1536
PINECONE_REGION = "us-east-1"
MAX_LINES_PREVIEW = 5

TOKEN_LIMIT = 800

# Supported extensions
code_exts = {".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c", ".cs", ".go", ".rb", ".php", ".rs", ".swift",".kt", ".kts", ".scala", ".cjs", ".mjs", ".ipynb", ".sh"}
markdown_exts = {".md", ".mdx", ".txt", ".rst", ".adoc"}
data_exts = {".csv", ".tsv", ".xls", ".xlsx", ".parquet", ".feather", ".pkl"}
json_exts = {".json", ".yaml", ".yml", ".jsonl", ".webmanifest"}
image_exts = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".psd", ".bmp", ".tiff", ".ico"}
font_exts = {".woff", ".woff2", ".ttf", ".otf", ".eot"}
binary_exts = {".map", ".zip", ".exe", ".bin", ".dll", ".so", ".o", ".gz"}
minified_exts = {".min.js", ".min.css", ".js.map", ".css.map"}
docs_exts = {".pdf", ".docx", ".doc", ".rtf", ".odt"}
html_exts = {".html", ".htm", ".xhtml"}
css_exts = {".css", ".scss", ".sass", ".less"}
xml_exts = {".xml", ".xsd", ".xsl"}

print("Libraries imported and API keys loaded.")

Libraries imported and API keys loaded.


In [None]:
# Cell 2: Define Repos and Verify Extensions
REPOS = {
    "ModelEarth/modelearth": "/Users/lokesh/Desktop/Model_Earth/modelearth"  # Update with actual path
}

def verify_extensions(repo_path):
    found_extensions = set()
    handled_extensions = code_exts | markdown_exts | data_exts | json_exts | image_exts | font_exts | binary_exts | minified_exts | docs_exts | html_exts | css_exts
    for dirpath, _, filenames in os.walk(repo_path):
        if ".git" in dirpath.split(os.sep):
            continue
        for filename in filenames:
            if filename == ".DS_Store":
                continue
            ext = os.path.splitext(filename)[-1].lower()
            found_extensions.add(ext)
    unhandled = found_extensions - handled_extensions
    print(f"Found extensions: {found_extensions}")
    if unhandled:
        print(f"Unhandled extensions: {unhandled}")
    for ext in found_extensions & handled_extensions:
        if ext in code_exts:
            print(f"{ext} -> Code chunking (Python/JS/TS)")
        elif ext in markdown_exts:
            print(f"{ext} -> Markdown chunking")
        elif ext in data_exts:
            print(f"{ext} -> Data preview chunking")
        elif ext in json_exts:
            print(f"{ext} -> JSON/YAML chunking")
        elif ext in image_exts:
            print(f"{ext} -> Image summary")
        elif ext in font_exts:
            print(f"{ext} -> Font summary")
        elif ext in binary_exts or ext in minified_exts:
            print(f"{ext} -> Binary/minified summary")
        elif ext in docs_exts:
            print(f"{ext} -> Document summary")
        elif ext in html_exts:
            print(f"{ext} -> HTML chunking")
        elif ext in css_exts:
            print(f"{ext} -> CSS chunking")

for repo_name, repo_path in REPOS.items():
    if not os.path.exists(repo_path):
        print(f"Directory {repo_path} for {repo_name} does not exist. Skipping.")
        continue
    print(f"\nVerifying extensions for {repo_name}:")
    verify_extensions(repo_path)


Verifying extensions for ModelEarth/localsite:
Found extensions: {'', '.html', '.csv', '.gif', '.md', '.map', '.txt', '.woff2', '.svg', '.woff', '.xls', '.jpg', '.mdx', '.json', '.py', '.png', '.js', '.psd', '.css'}
Unhandled extensions: {''}
.json -> JSON/YAML chunking
.png -> Image summary
.js -> Code chunking (Python/JS/TS)
.psd -> Image summary
.mdx -> Markdown chunking
.gif -> Image summary
.css -> CSS chunking
.jpg -> Image summary
.woff -> Font summary
.md -> Markdown chunking
.map -> Binary/minified summary
.txt -> Markdown chunking
.woff2 -> Font summary
.svg -> Image summary
.html -> HTML chunking
.py -> Code chunking (Python/JS/TS)
.csv -> Data preview chunking
.xls -> Data preview chunking


In [None]:
from pathlib import Path
from tree_sitter import Language, Parser
import uuid
import re
import json

# Tokenizer setup
MAX_TOKENS = 8192
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    return len(tokenizer.encode(text))

def re_chunk_if_oversize(sections, max_tokens=MAX_TOKENS):
    final_chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        tokens = count_tokens(section)
        if tokens <= max_tokens:
            final_chunks.append(section)
        else:
            # Attempt semantic split first
            split_points = re.split(r'(?<=[.!?])\s+', section)
            current_chunk = ""
            for part in split_points:
                if count_tokens(current_chunk + " " + part) <= max_tokens:
                    current_chunk += " " + part
                else:
                    final_chunks.append(current_chunk.strip())
                    current_chunk = part
            if current_chunk.strip():
                final_chunks.append(current_chunk.strip())

            # Final hard split if still oversized
            really_final = []
            for chunk in final_chunks:
                if count_tokens(chunk) <= max_tokens:
                    really_final.append(chunk)
                else:
                    # Hard split on whitespace every ~3000 characters
                    raw_splits = re.findall(r'.{1,3000}(?:\s+|$)', chunk)
                    really_final.extend([s.strip() for s in raw_splits if s.strip()])
            final_chunks = really_final

    return final_chunks

# Supported extensions
code_exts = {".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c", ".cs", ".go", ".rb", ".php", ".rs", ".swift",".kt", ".kts", ".scala", ".cjs", ".mjs", ".ipynb", ".sh"}
markdown_exts = {".md", ".mdx", ".txt", ".rst", ".adoc"}
data_exts = {".csv", ".tsv", ".xls", ".xlsx", ".parquet", ".feather", ".h5", ".hdf5"}
json_exts = {".json", ".yaml", ".yml", ".jsonl", ".webmanifest"}
image_exts = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".psd", ".bmp", ".tiff", ".ico"}
font_exts = {".woff", ".woff2", ".ttf", ".otf", ".eot"}
binary_exts = {".map", ".zip", ".exe", ".bin", ".dll", ".so", ".o", ".gz"}
minified_exts = {".min.js", ".min.css", ".js.map", ".css.map"}
docs_exts = {".pdf", ".docx", ".doc", ".rtf", ".odt"}
html_exts = {".html", ".htm", ".xhtml"}
css_exts = {".css", ".scss", ".sass", ".less"}
xml_exts = {".xml", ".xsd", ".xsl"}


# Tree-sitter language support
LANGUAGES = {
    '.py': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'python'),
    '.js': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'javascript'),
    '.jsx': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'javascript'),
    '.cjs': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'javascript'),
    '.ts': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'typescript'),
    '.java': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'java'),
    '.cpp': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'cpp'),
    '.c': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'c'),
    '.cs': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'c_sharp'),
    '.go': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'go'),
    '.rb': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'ruby'),
    '.php': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'php'),
    '.rs': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'rust'),
    '.swift': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'swift'),
    '.html': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'html'),
    '.css': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'css'),
    '.xml': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'xml'),
    '.scss': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'css'),
    '.sh': Language('/Users/lokesh/Desktop/Model_Earth/tree-sitter/build/my-languages.so', 'bash'),
    }


def generate_repo_structure(repo_path: Path, repo_name: str) -> dict:
    def build_compressed_tree(path: Path, base_path: Path) -> str:
        # Collect unique directory paths and assign IDs
        dir_map = {}
        dir_counter = 1
        structure = []
        
        for p in sorted(path.rglob("*")):
            relative_path = p.relative_to(base_path).as_posix()
            parent_path = str(p.parent.relative_to(base_path)) if p != base_path else ""
            parent_id = ""
            
            # Assign ID to parent directory if not already mapped
            if parent_path and parent_path not in dir_map:
                dir_map[parent_path] = f"d{dir_counter}"
                dir_counter += 1
            parent_id = dir_map.get(parent_path, "")
            
            if p.is_file():
                # File: use parent ID and file name with extension
                structure.append(f"{parent_id}:{p.name}|{p.suffix or ''}" if parent_id else f"{p.name}|{p.suffix or ''}")
            else:
                # Directory: only include if it has no children (leaf directory)
                if not any(q.is_dir() for q in p.iterdir()):
                    structure.append(f"{parent_id}:{p.name}:" if parent_id else f"{p.name}:")
        
        # Create ID map (e.g., "d1:.git:hooks,d2:input:industries")
        id_map = ",".join(f"{k}:{v.replace('/', ':')}" for v, k in dir_map.items())
        # Join structure with semicolons
        content = ";".join(structure)
        # Combine map and content
        return f"{id_map}|{content}" if id_map else content

    content = build_compressed_tree(repo_path, repo_path)
    return {
        "repo_name": repo_name,
        "file_path": f"{repo_name}/repo_structure",
        "file_type": "structure",
        "chunk_type": "repo_structure",
        "chunk_id": str(uuid.uuid4()),
        "embed": True,
        "content": content,
        "embedded": False,
        "line_range": "L1-L1"
    }

def chunk_code_tree_sitter(filepath, ext):
    parser = Parser()
    parser.set_language(LANGUAGES[ext])
    code = Path(filepath).read_text(encoding="utf-8")
    tree = parser.parse(code.encode("utf-8"))
    root = tree.root_node
    chunks = []
    for child in root.children:
        if child.type in ['function_definition', 'class_definition', 'function', 'method_definition', 'element', 'style_rule']:
            snippet = code[child.start_byte:child.end_byte]
            chunks.append(snippet.strip())
    return chunks if chunks else [code.strip()[:1000]]

def chunk_ipynb(filepath):
    try:
        with open(filepath, 'r', encoding="utf-8") as f:
            notebook = json.load(f)
        chunks = []
        for i, cell in enumerate(notebook.get('cells', [])):
            cell_type = cell.get('cell_type')
            source = ''.join(cell.get('source', [])).strip()
            if not source:
                continue
            if cell_type == 'markdown':
                chunks.append(f"[Markdown Cell {i+1}]\n{source}")
            elif cell_type == 'code':
                chunks.append(f"[Code Cell {i+1}]\n{source}")
            # Limit metadata to key info (e.g., kernel) to avoid bloat
            if i == 0 and 'metadata' in notebook:
                metadata = {k: v for k, v in notebook['metadata'].items() if k in ['kernelspec', 'language_info']}
                chunks.append(f"[Notebook Metadata]\n{json.dumps(metadata, indent=2)}")
        return re_chunk_if_oversize(chunks)
    except Exception as e:
        return [f"⚠️ Error parsing {filepath.name}: {e}"]

def chunk_html_tree_sitter_safe(filepath):
    code = Path(filepath).read_text(encoding="utf-8")
    parser = Parser()
    parser.set_language(LANGUAGES['.html'])
    tree = parser.parse(code.encode("utf-8"))
    root = tree.root_node
    chunks = []

    def collect_structural_nodes(node, code, max_tokens=MAX_TOKENS):
        if node.type in ['script_element', 'style_element']:
            return

        if node.type == 'element' and node.named_child_count > 0 and node.start_byte != node.end_byte:
            try:
                tag_snippet = code[node.start_byte:node.start_byte + code[node.start_byte:].find('>') + 1]
                tag_name = re.findall(r'<(\w+)', tag_snippet)[0]
            except Exception:
                tag_name = ""

            if tag_name in ['div', 'section', 'article', 'header', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']:
                snippet = code[node.start_byte:node.end_byte].strip()
                if snippet:
                    if count_tokens(snippet) <= max_tokens:
                        chunks.append(snippet)
                    else:
                        sub_chunks = re_chunk_if_oversize([snippet])
                        chunks.extend(sub_chunks)

        for child in node.children:
            collect_structural_nodes(child, code)

    # Traverse root tree
    for child in root.children:
        collect_structural_nodes(child, code)

    # Fallback in case nothing was collected
    if not chunks:
        fallback = code.strip()
        if count_tokens(fallback) <= MAX_TOKENS:
            chunks.append(fallback)
        else:
            chunks.extend(re_chunk_if_oversize([fallback]))

    # 🛡️ Final safety net
    safe_chunks = []
    for chunk in chunks:
        if count_tokens(chunk) <= MAX_TOKENS:
            safe_chunks.append(chunk)
        else:
            safe_chunks.extend(re_chunk_if_oversize([chunk]))

    return safe_chunks


def chunk_xml(filepath):
    code = Path(filepath).read_text(encoding="utf-8")
    parser = Parser()
    parser.set_language(LANGUAGES['.xml'])
    tree = parser.parse(code.encode("utf-8"))
    root = tree.root_node
    chunks = []

    def collect_structural_nodes(node, code, max_tokens=MAX_TOKENS):
        if node.type == 'element' and node.named_child_count > 0 and node.start_byte != node.end_byte:
            snippet = code[node.start_byte:node.end_byte].strip()
            if snippet and count_tokens(snippet) <= max_tokens:
                chunks.append(snippet)
            elif snippet:
                sub_chunks = re_chunk_if_oversize([snippet])
                chunks.extend(sub_chunks)
        for child in node.children:
            collect_structural_nodes(child, code)

    for child in root.children:
        collect_structural_nodes(child, code)
    
    if not chunks:
        chunks.append(code.strip()[:1000])
    
    return re_chunk_if_oversize(chunks)

def chunk_markdown(filepath):
    text = Path(filepath).read_text(encoding="utf-8")
    sections, current = [], []
    for line in text.splitlines():
        if line.startswith("#") and current:
            sections.append("\n".join(current).strip())
            current = []
        current.append(line)
    if current:
        sections.append("\n".join(current).strip())
    return sections

def chunk_json_yaml(filepath):
    try:
        with open(filepath, 'r', encoding="utf-8") as f:
            data = yaml.safe_load(f) if filepath.suffix in {'.yaml', '.yml'} else json.load(f)
        
        chunks = []

        def chunk_value(value, parent_key=None):
            text = json.dumps(value, indent=2)
            if count_tokens(text) <= MAX_TOKENS:
                chunks.append(text)
            else:
                # Try chunking dicts by keys
                if isinstance(value, dict):
                    for k, v in value.items():
                        chunk_value({k: v}, parent_key=k)
                elif isinstance(value, list):
                    for i, item in enumerate(value):
                        chunk_value(item, parent_key=f"{parent_key or 'list'}[{i}]")
                else:
                    # fallback for unbreakable types
                    chunks.append(text)

        chunk_value(data)
        return chunks
    except Exception as e:
        return [f"⚠️ Error parsing {filepath.name}: {e}"]



def chunk_csv_xls(filepath: Path):
    try:
        suffix = filepath.suffix.lower()
        if suffix in {'.xls', '.xlsx'}:
            df = pd.read_excel(filepath, nrows=5)
        elif suffix in {'.csv', '.tsv'}:
            df = pd.read_csv(filepath, sep=None, engine='python', nrows=5, encoding='utf-8', encoding_errors='ignore')
        elif suffix == '.parquet':
            df = pd.read_parquet(filepath)
            df = df.head(5)
        elif suffix == '.feather':
            df = pd.read_feather(filepath)
            df = df.head(5)
        elif suffix == '.pkl':
            df = pd.read_pickle(filepath)
            if not isinstance(df, pd.DataFrame):
                return [f"⚠️ Pickle file does not contain a DataFrame: {filepath.name}"]
            df = df.head(5)
        else:
            return [f"⚠️ Unsupported file format: {suffix}"]

        if df.empty:
            return [f"⚠️ Empty DataFrame in {filepath.name}"]

        columns = ", ".join(df.columns)
        preview = df.to_string(index=False)
        chunk = f"Columns: {columns}\nPreview (first 5 rows):\n{preview}"
        return [chunk] if chunk.strip() else [f"⚠️ Empty chunk for {filepath.name}"]

    except Exception as e:
        # Fallback to raw text read
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()[:5]
                chunk = f"Raw content (first 5 lines):\n{''.join(lines).strip()}"
                return [chunk] if chunk.strip() else [f"⚠️ Empty file content for {filepath.name}"]
        except Exception as e2:
            return [f"⚠️ Error reading {filepath.name}: {e2}"]


def chunk_as_summary(filepath: Path, root_folder: str = "Model_Earth") -> list[str]:
    """
    Return a summary chunk with a clean relative path starting from the project root folder.

    Example output:
    ['PDF file at Model_Earth/docs/report.pdf']
    """
    # Convert to absolute path and POSIX-style string
    full_path = filepath.resolve().as_posix()

    # Find the path from root_folder onward
    if root_folder in full_path:
        relative_path = full_path.split(root_folder, 1)[-1]
        clean_path = f"{root_folder}{relative_path}"
    else:
        clean_path = filepath.name  # fallback if root_folder not found

    # Format as summary
    return [f"{filepath.suffix.upper()[1:]} file at {clean_path}"]


def dispatch_chunking(file):
    ext = file.suffix.lower()
    try:
        if ext in code_exts:
            if ext == '.ipynb':
                chunks = chunk_ipynb(file)
                embed = True
            else:
                chunks = chunk_code_tree_sitter(file, ext)
                embed = True
        elif ext in html_exts:
            chunks = chunk_html_tree_sitter_safe(file)
            embed = True
        elif ext in css_exts:
            chunks = chunk_code_tree_sitter(file, ext)
            embed = True
        elif ext in xml_exts:
            chunks = chunk_xml(file)
            embed = True
        elif ext in markdown_exts:
            chunks = chunk_markdown(file)
            embed = True
        elif ext in json_exts:
            chunks = chunk_json_yaml(file)
            embed = True
        elif ext in data_exts:
            chunks = chunk_csv_xls(file)
            embed = True
        elif ext in image_exts | font_exts | binary_exts | minified_exts | docs_exts:
            chunks = chunk_as_summary(file)
            embed = True
        else:
            summary = chunk_as_summary(file)
            return [(chunk.strip(), True) for chunk in summary if chunk.strip()]


        chunks = re_chunk_if_oversize(chunks) if embed else chunks
        return [(chunk, embed) for chunk in chunks]
    except Exception as e:
        return [(f"⚠️ Error parsing {file.name}: {e}", False)]

def get_line_range(content, full_text):
    if not full_text or not content:  # For summaries or non-text files
        return "L1-L1"  # Treat summary as single-line
    try:
        start_idx = full_text.find(content)
        if start_idx == -1:
            return "L1-L1"  # Fallback for summaries or unmatched content
        before = full_text[:start_idx]
        start_line = before.count("\n") + 1
        line_count = content.count("\n") + 1
        end_line = start_line + line_count - 1
        return f"L{start_line}-L{end_line}"
    except:
        return "L1-L1"  # Fallback for any errors

def collect_all_valid_files(repo_path: Path, allowed_exts: set) -> list:
    return [file for file in repo_path.rglob("*") if file.is_file() and file.suffix.lower() in allowed_exts]

# Main execution
chunks_dict = defaultdict(list)

for repo_name, repo_path in REPOS.items():
    print(f"\n🔍 Chunking files for {repo_name}")
    path = Path(repo_path)
    valid_files = [file for file in path.rglob("*") if file.is_file()]
    repo_chunks = [generate_repo_structure(path, repo_name)]

    for file in valid_files:
        ext = file.suffix.lower()
        try:
            full_text = file.read_text(encoding="utf-8", errors="ignore") \
                        if ext not in (image_exts | font_exts | binary_exts | minified_exts | docs_exts) else ""
        except:
            full_text = ""
        chunk_data = dispatch_chunking(file)
        for i, (chunk, embed) in enumerate(chunk_data):
            # Skip only if chunk is not a string
            if not isinstance(chunk, str):
                print(f"⚠️ Skipping non-string chunk from {file.name}")
                continue

            chunk_entry = {
                "repo_name": repo_name,
                "file_path": f"{repo_name}/{file.relative_to(path)}",
                "file_type": ext.lstrip('.'),
                "chunk_type": "summary" if not embed else "content",
                "chunk_id": str(uuid.uuid4()),
                "embed": embed,
                "content": chunk.strip(),
                "embedded": False,
                "line_range": get_line_range(chunk, full_text)
            }
            repo_chunks.append(chunk_entry)

    output_path = Path(OUTPUT_DIR) / f"{repo_name.replace('/', '_')}_chunks.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for item in repo_chunks:
            json.dump(item, f)
            f.write("\n")

    chunks_dict[repo_name] = repo_chunks
    print(f"✅ Saved chunks to {output_path}")




🔍 Chunking files for ModelEarth/localsite
✅ Saved chunks to /Users/lokesh/Desktop/Model_Earth/rag-pipeline/ModelEarth_localsite_chunks.jsonl


In [4]:
# DEBUG: Count what’s actually going into repo_chunks
total_chunks = len(repo_chunks)
embeddable = sum(1 for c in repo_chunks if c["embed"])
non_embeddable = sum(1 for c in repo_chunks if not c["embed"])
empty_chunks = sum(1 for c in repo_chunks if not c["content"].strip())
print(f"📦 Final Chunk Stats for {repo_name}")
print(f"   ➤ Total chunks: {total_chunks}")
print(f"   ➤ Marked for embedding (embed=True): {embeddable}")
print(f"   ➤ Skipped from embedding (embed=False): {non_embeddable}")
print(f"   ➤ Empty content chunks (embed=True): {empty_chunks}")


📦 Final Chunk Stats for ModelEarth/localsite
   ➤ Total chunks: 940
   ➤ Marked for embedding (embed=True): 940
   ➤ Skipped from embedding (embed=False): 0
   ➤ Empty content chunks (embed=True): 0


In [5]:
# Final debug print to catch skipped chunks
skipped_chunks = [c for c in repo_chunks if not c["embed"]]
print(f"\n🚨 Skipped Chunks (embed=False): {len(skipped_chunks)}")
for sc in skipped_chunks:
    print(f"⛔ {sc['file_path']} | type={sc['file_type']} | content={sc['content'][:60]}")



🚨 Skipped Chunks (embed=False): 0


In [None]:
import tiktoken
from pathlib import Path
import json

# Setup tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
MAX_TOKENS = 8192

# Directory where chunked files are saved
CHUNK_DIR = Path(OUTPUT_DIR)  # or replace with actual path

# Scan all jsonl files in output dir
for jsonl_file in CHUNK_DIR.glob("_chunks.jsonl"):
    print(f"\n🔍 Checking: {jsonl_file.name}")
    over_limit_chunks = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            tokens = len(tokenizer.encode(obj["content"])) if obj.get("content") else 0
            if tokens > MAX_TOKENS:
                over_limit_chunks.append((obj["file_path"], obj["chunk_id"], tokens))

    if over_limit_chunks:
        print(f"❌ Found {len(over_limit_chunks)} chunks over {MAX_TOKENS} tokens:")
        for path, chunk_id, count in over_limit_chunks:
            print(f"  - {path} | Chunk ID: {chunk_id} | Tokens: {count}")
    else:
        print("✅ All chunks are within token limits.")



🔍 Checking: ModelEarth_localsite_chunks.jsonl
✅ All chunks are within token limits.


In [None]:

DIMENSION = 1536  # Dimension for text-embedding-3-small
METRIC = "cosine"
BATCH_SIZE = 10 

embeddings_dict = {}

for repo_name, chunks in chunks_dict.items():
    print(f"\n🔍 Embedding chunks for: {repo_name}")
    repo_embeddings = []
    
    for chunk in tqdm(chunks, desc=f"Embedding {repo_name}"):
        if chunk.get("embed", False) and not chunk.get("embedded", False):
            try:
                # Generate embedding
                response = openai_client.embeddings.create(
                    model="text-embedding-3-small",
                    input=chunk["content"]
                )
                vector = response.data[0].embedding

                # Mark as embedded and store embedding with content
                chunk["embedded"] = True
                repo_embeddings.append({
                    "id": chunk["chunk_id"],
                    "values": vector,
                    "metadata": {
                        "repo_name": chunk["repo_name"],
                        "file_path": chunk["file_path"],
                        "file_type": chunk["file_type"],
                        "chunk_type": chunk["chunk_type"],
                        "line_range": chunk["line_range"],
                        "content": chunk["content"]  # Include content for Pinecone
                    }
                })
            except Exception as e:
                print(f"⚠️ Failed to embed chunk {chunk['chunk_id']}: {e}")

    embeddings_dict[repo_name] = repo_embeddings
    print(f"✅ Embedded {len(repo_embeddings)} chunks for {repo_name}")


🔍 Embedding chunks for: ModelEarth/localsite


Embedding ModelEarth/localsite: 100%|██████████| 940/940 [03:17<00:00,  4.75it/s]

✅ Embedded 940 chunks for ModelEarth/localsite





In [9]:
total = len(chunks_dict["ModelEarth/localsite"])
to_embed = sum(1 for c in chunks_dict["ModelEarth/localsite"] if c.get("embed", False))
print(f"Total chunks: {total}, To embed: {to_embed}")


Total chunks: 940, To embed: 940


In [15]:
INDEX_NAME = "model-earth-jam-stack"
METRIC = "cosine"
DIMENSION = 1536 

# Create or access index
if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating Pinecone index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric=METRIC,
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to index
index = pc.Index(INDEX_NAME)

# Upsert embeddings per repository in batches
for repo_name, records in embeddings_dict.items():
    namespace = repo_name.replace("/", "_")  # Safe namespace
    print(f"\n🚀 Uploading embeddings for repo: {repo_name} (namespace: {namespace})")
    if not records:
        print(f"⚠️ No embeddings to upload for {repo_name}")
        continue

    total_uploaded = 0
    for i in range(0, len(records), BATCH_SIZE):
        batch = records[i:i + BATCH_SIZE]
        vectors = [
            {
                "id": record["id"],
                "values": record["values"],
                "metadata": {
                    "repo_name": record["metadata"].get("repo_name", ""),
                    "file_path": record["metadata"].get("file_path", ""),
                    "file_type": record["metadata"].get("file_type", ""),
                    "chunk_type": record["metadata"].get("chunk_type", "unknown"),
                    "line_range": record["metadata"].get("line_range", ""),
                    "content": record["metadata"].get("content", "")
                }
            }
            for record in batch
            if record.get("id") and record.get("values") and record["metadata"].get("content")
        ]

        if not vectors:
            print(f"⚠️ Skipping invalid batch at index {i} for {repo_name} (no valid embeddings)")
            continue

        # Check metadata size (Pinecone limit: 40KB)
        for vector in vectors:
            metadata_size = len(json.dumps(vector["metadata"]).encode("utf-8"))
            if metadata_size > 40000:
                print(f"⚠️ Truncating content for chunk {vector['id']} (size: {metadata_size} bytes)")
                vector["metadata"]["content"] = vector["metadata"]["content"][:10000]

        try:
            index.upsert(vectors=vectors, namespace=namespace)
            total_uploaded += len(vectors)
            print(f"✅ Uploaded batch of {len(vectors)} embeddings for {repo_name}")
        except Exception as e:
            print(f"❌ Batch failed at index {i} for {repo_name}: {e}")

    print(f"✅ Uploaded {total_uploaded} vectors for {repo_name} into namespace '{namespace}'")

Creating Pinecone index: model-earth-jam-stack

🚀 Uploading embeddings for repo: ModelEarth/localsite (namespace: ModelEarth_localsite)
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for ModelEarth/localsite
✅ Uploaded batch of 20 embeddings for 