In [1]:
from pathlib import Path
import numpy as np
from tqdm import tqdm
from embed import get_chunks, get_embeddings
from config import GEMINI_API_KEY
from extract_text import extract_text_from_markdown
import asyncio
import signal
import json, sys, os

In [28]:
files = [*Path("raw-data/Markdown-data").glob("*.md")]
all_chunks = []
all_embeddings = []
all_original_urls = []
total_chunks = 0
file_chunks = {}
file_urls = {}

existing_count = 0
if os.path.exists("emergency_save_me.json"):
    print("📂 Loading from emergency_save_me.json...")
    with open("emergency_save_me.json", "r", encoding="utf-8") as f:  # Fixed filename
        saved_data = json.load(f)
        all_chunks_loaded = saved_data["chunks"]
        all_embeddings_loaded = saved_data["embeddings"] 
        all_original_urls_loaded = saved_data["original_urls"]
        
        # Count only valid (non-null) embeddings
        valid_count = 0
        for i, embedding in enumerate(all_embeddings_loaded):
            if embedding[0] and len(str(embedding[0]).strip()) > 1:  # Simplified validation
                valid_count += 1
            else:
                break
        
        # Keep only valid entries
        all_chunks = all_chunks_loaded[:valid_count]
        all_embeddings = all_embeddings_loaded[:valid_count]
        all_original_urls = all_original_urls_loaded[:valid_count]
        existing_count = valid_count - 2

        print(f"✅ Loaded {existing_count} valid chunks (filtered out null embeddings)")

# First pass: extract content and count chunks
for file_path in files:
    content, original_url = extract_text_from_markdown(file_path)
    chunks = get_chunks(content)
    file_chunks[file_path] = chunks
    file_urls[file_path] = original_url
    total_chunks += len(chunks)
    print(f"File: {file_path.name}, Chunks: {len(chunks)}")

print(f"Total chunks created: {total_chunks}")

# **FIXED**: Calculate which file and chunk to start from
chunks_to_skip = existing_count
start_file_index = 0
start_chunk_index = 0

# Find the starting file and chunk position
file_list = list(file_chunks.items())
for i, (file_path, chunks) in enumerate(file_list):
    if chunks_to_skip >= len(chunks):
        chunks_to_skip -= len(chunks)
    else:
        start_file_index = i
        start_chunk_index = chunks_to_skip
        break

processed_count = existing_count
# Second pass: process chunks starting from the correct position
with tqdm(total=total_chunks, initial=existing_count, desc="Processing Chunks") as pbar:
    for i in range(start_file_index, len(file_list)):
        file_path, chunks = file_list[i]
        original_url = file_urls[file_path]
        
        # Start from the correct chunk index for the first file, 0 for others
        chunk_start = start_chunk_index if i == start_file_index else 0
        
        for j in range(chunk_start, len(chunks)):
            chunk = chunks[j]
            try:
                embedding = await get_embeddings(chunk, api_key=GEMINI_API_KEY)
                all_chunks.append([chunk])  # Simplified - removed unnecessary list wrapping
                all_embeddings.append([embedding])  # Simplified
                all_original_urls.append([original_url])  # Simplified
                processed_count += 1
                pbar.set_postfix({"file": file_path.name, "chunk": processed_count})
                print(f'{chunk} is created of url {original_url} and first few embeddings are: {embedding[:6]}')
            except Exception as e:
                print(f"Error processing chunk in {file_path}: {e}")
            finally:
                pbar.update(1)
        
        break


📂 Loading from emergency_save_me.json...
✅ Loaded 66 valid chunks (filtered out null embeddings)
File: 1._Development_Tools.md, Chunks: 1
File: 2._Deployment_Tools.md, Chunks: 1
File: 3._Large_Language_Models.md, Chunks: 1
File: 4._Data_Sourcing.md, Chunks: 1
File: 5._Data_Preparation.md, Chunks: 1
File: 6._Data_Analysis.md, Chunks: 1
File: 7._Data_Visualization.md, Chunks: 1
File: Actor_Network_Visualization.md, Chunks: 1
File: AI_Code_Editors__GitHub_Copilot.md, Chunks: 1
File: AI_Terminal_Tools__llm.md, Chunks: 1
File: Authentication__Google_Auth.md, Chunks: 1
File: Base_64_Encoding.md, Chunks: 1
File: BBC_Weather_API_with_Python.md, Chunks: 1
File: Browser__DevTools.md, Chunks: 1
File: CI_CD__GitHub_Actions.md, Chunks: 1
File: Cleaning_Data_with_OpenRefine.md, Chunks: 1
File: Containers__Docker,_Podman.md, Chunks: 1
File: Convert_HTML_to_Markdown.md, Chunks: 2
File: Convert_PDFs_to_Markdown.md, Chunks: 1
File: Correlation_with_Excel.md, Chunks: 1
File: CORS.md, Chunks: 1
File: Craw

Processing Chunks:  63%|██████▎   | 67/106 [00:01<01:05,  1.67s/it, file=Parsing_JSON.md, chunk=67]

Parsing JSON JSON is everywhere—APIs, logs, configuration files—and its nested or large structure can challenge memory and processing. In this tutorial, we’ll explore tools to flatten, stream, and query JSON data efficiently. For example, we’ll often need to process a multi-gigabyte log file from a web service where each record is a JSON object. This requires us to handle complex nested structures, large files that don’t fit in memory, or extract specific fields. Here are the key tools and techniques for efficient JSON parsing: | Tool | Extract from JSON… | Why | | --- | --- | --- | | jq | JSON in the shell | Quick data exploration and pipeline processing | | JMESPath | JSON in Python | Handle complex queries with a clean syntax | | ijson | JSON streams in Python | Parse streaming/large JSON files memory-efficiently | | Pandas | JSON columns in Python | Fast analysis of structured data | | SQL JSON | JSON in databases | Combine structured and semi-structured data | | DuckDB | JSON anyw




In [29]:
# Inspect every stored chunk entry from index 68 onward.
all_chunks[68:]

[['Parsing JSON JSON is everywhere—APIs, logs, configuration files—and its nested or large structure can challenge memory and processing. In this tutorial, we’ll explore tools to flatten, stream, and query JSON data efficiently. For example, we’ll often need to process a multi-gigabyte log file from a web service where each record is a JSON object. This requires us to handle complex nested structures, large files that don’t fit in memory, or extract specific fields. Here are the key tools and techniques for efficient JSON parsing: | Tool | Extract from JSON… | Why | | --- | --- | --- | | jq | JSON in the shell | Quick data exploration and pipeline processing | | JMESPath | JSON in Python | Handle complex queries with a clean syntax | | ijson | JSON streams in Python | Parse streaming/large JSON files memory-efficiently | | Pandas | JSON columns in Python | Fast analysis of structured data | | SQL JSON | JSON in databases | Combine structured and semi-structured data | | DuckDB | JSON a

In [36]:
# Inspect the stored embedding entry at index 67.
all_embeddings[67]

[[0.015401099,
  -0.0040160925,
  0.027933642,
  -0.057622503,
  -0.021423928,
  -0.0055042217,
  -0.013916955,
  -0.0045992546,
  -0.014560878,
  -0.015298808,
  -0.0037802518,
  -0.027983941,
  0.011000204,
  0.0071733277,
  0.13441342,
  0.0024995718,
  -0.007537131,
  -0.044087704,
  0.036963254,
  0.020606656,
  0.0068724058,
  0.012738272,
  -0.019748084,
  -0.01536124,
  -0.017556025,
  -0.04765665,
  0.0068945927,
  -0.015613508,
  0.034896612,
  -0.007144758,
  0.005604778,
  0.005994941,
  0.008156497,
  0.0014812168,
  0.0039132894,
  0.023994433,
  -0.016899893,
  -0.008875052,
  -0.014657001,
  0.03294927,
  -0.010956032,
  -0.0015527925,
  -0.0008555846,
  0.01326794,
  0.008604556,
  -0.009170425,
  0.017281914,
  0.00483937,
  0.014017098,
  0.013361093,
  -0.023404352,
  0.024613746,
  -0.04236456,
  -0.18889955,
  0.023949085,
  -0.0021708778,
  -0.028024783,
  0.0007359393,
  0.006350232,
  -0.004601956,
  -0.0014595371,
  0.016798751,
  -0.038871914,
  -0.03549049,


In [31]:
# Bundle the chunk, embedding and URL lists under their JSON keys and write
# them out as indented UTF-8 JSON (non-ASCII characters are kept verbatim).
data_safe = dict(
    chunks=all_chunks,
    embeddings=all_embeddings,
    original_urls=all_original_urls,
)

with open("markdown_embeddings_experi.json", "w", encoding="utf-8") as f:
    json.dump(data_safe, f, ensure_ascii=False, indent=2)

In [32]:
# Print the chunk entries stored at positions 67 and 68, one per line.
for position in (67, 68):
    print(data_safe['chunks'][position])

['Parsing JSON JSON is everywhere—APIs, logs, configuration files—and its nested or large structure can challenge memory and processing. In this tutorial, we’ll explore tools to flatten, stream, and query JSON data efficiently. For example, we’ll often need to process a multi-gigabyte log file from a web service where each record is a JSON object. This requires us to handle complex nested structures, large files that don’t fit in memory, or extract specific fields. Here are the key tools and techniques for efficient JSON parsing: | Tool | Extract from JSON… | Why | | --- | --- | --- | | jq | JSON in the shell | Quick data exploration and pipeline processing | | JMESPath | JSON in Python | Handle complex queries with a clean syntax | | ijson | JSON streams in Python | Parse streaming/large JSON files memory-efficiently | | Pandas | JSON columns in Python | Fast analysis of structured data | | SQL JSON | JSON in databases | Combine structured and semi-structured data | | DuckDB | JSON an

In [33]:
# Read the saved JSON file back into memory as `data`.
with open("markdown_embeddings_experi.json", encoding="utf-8") as f:
    data = json.loads(f.read())

In [37]:
# Element 0 of entry 68 in the reloaded embeddings.
data['embeddings'][68][0]

[0.002378232,
 -0.013437408,
 0.006254516,
 -0.07474148,
 0.018676104,
 -0.022904221,
 0.01497215,
 0.022852534,
 -0.0068420973,
 -0.013070647,
 -0.009434912,
 -0.011786722,
 0.016732808,
 0.0034034555,
 0.11670286,
 -0.009543681,
 0.013881774,
 0.030388309,
 0.01675909,
 -0.027571758,
 0.008363286,
 -0.005643132,
 -0.016398478,
 -0.02343412,
 0.0024755206,
 -0.0250002,
 -0.01049141,
 -0.02493171,
 0.0483544,
 0.008872562,
 -0.007865693,
 0.020536223,
 0.024082009,
 -0.023288196,
 -0.008363765,
 -0.0062553408,
 -0.0019811345,
 0.0054711765,
 -0.024459938,
 0.015772156,
 -0.017085465,
 0.0010674937,
 -0.020650227,
 0.0051416154,
 -0.00090093387,
 0.0063968087,
 -0.007417439,
 -0.004878619,
 -0.038087673,
 0.0041139172,
 0.016603017,
 -0.0050578034,
 -0.011928905,
 -0.18016036,
 0.0078053237,
 -0.006036257,
 -0.0155466115,
 -0.01874758,
 0.038391758,
 -0.0053273244,
 -0.033141565,
 0.016999783,
 -0.040928733,
 -0.011410794,
 0.015145713,
 -0.014245424,
 0.011699482,
 0.0010080192,
 -0.00

In [None]:
# Disabled check (kept for reference): prints the type of element 0 of every
# entry in data['embeddings'].
# for i in range(len(data['embeddings'])):
#     # print(f"Chunk {i+1}:")
#     # print(f"Content: {chunks[i]}")
#     # print(f"Embedding: {embeddings[i][0]}")
#     # print(data['embeddings'][i][0])
#     print(type(data['embeddings'][i][0]))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'li