In [1]:
import pandas as pd
import json

In [2]:
with open("backend/data/market_cache_2.json", "r") as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data.values(), index=data.keys())

In [4]:
df.drop(columns = ["news"], inplace = True)

In [5]:
df['ticker'] = df.index

In [6]:
df.reset_index(drop = True, inplace = True)

In [7]:
sec_df = pd.DataFrame()

for index, row in df.iterrows():
    filling_data = pd.DataFrame(row["filings"])
    filling_data["ticker"] = row["ticker"]
    filling_data["timestamp"] = row["timestamp"]
    sec_df = pd.concat([sec_df, filling_data], axis = 0)

In [8]:
sec_df['Markdown_content']  = sec_df['content'].apply(lambda x: x.get('markdown', None))

In [9]:
sec_df.columns

Index(['ticker', 'timestamp', 'type', 'title', 'date', 'url',
       'accession_number', 'scraped', 'content', 'Markdown_content'],
      dtype='object')

### Semantic Markdown based Chunking

In [10]:
import re

import re

def semantic_chunk_markdown(md_text: str, max_chunk_size: int = 3000, overlap: int = 300) -> list[str]:
    """
    Splits markdown text based on section headers while respecting max chunk size and overlap.
    """
    if not md_text or not isinstance(md_text, str):
        return []
    
    sections = re.split(r'(?=^#+\s)', md_text, flags=re.MULTILINE)
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # If the section fits in one chunk, just add it
        if len(section) <= max_chunk_size:
            chunks.append(section)
            continue

        # If too long, break it into overlapping chunks
        start = 0
        while start < len(section):
            end = min(start + max_chunk_size, len(section))
            chunk = section[start:end]
            chunks.append(chunk.strip())
            # Step forward with overlap
            start += max_chunk_size - overlap

    return chunks



In [11]:
sec_df["Chunks"] = sec_df["Markdown_content"].apply(semantic_chunk_markdown)

In [12]:
sec_df['Chunk_Length'] = sec_df['Chunks'].apply(lambda x: len(x))

In [13]:
sec_df['Content_Length'] = sec_df['Markdown_content'].apply(lambda x: len(x.split(" ")) if isinstance(x, str) else 0)

In [14]:
sec_df['Cotent_Length_char'] = sec_df['Markdown_content'].apply(lambda x: len(x) if isinstance(x, str) else 0)

In [15]:
sec_df_exploded = sec_df.explode("Chunks").reset_index(drop = True)

In [16]:
sec_df['type'].unique()

array(['4', '144', 'NPORT-P', 'N-30D', 'N-CEN', '8-K', 'SCHEDULE 13G/A',
       '13F-HR', '3', '424B2', 'SD', 'DEFA14A', '10-Q', 'PX14A6G',
       'SCHEDULE 13G', '3/A', '10-Q/A', '10-K/A'], dtype=object)

In [17]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Initialize client (use your URL / API key)
qdrant = QdrantClient(url="http://localhost:6333")

# Define schema
vector_size = 384  # depends on embedding model used
collection_name = "sec_filings"

qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=vector_size,
        distance=models.Distance.COSINE
    ),
    optimizers_config=models.OptimizersConfigDiff(
        indexing_threshold=20000  # helps with larger datasets
    ),
    on_disk_payload=True  # good for large metadata
)


  qdrant.recreate_collection(


True

In [18]:
qdrant.scroll(collection_name="sec_filings", limit=1)

([], None)

In [19]:
sec_df_exploded.dropna(inplace = True)

In [20]:
sec_df_exploded.shape

(1578, 14)

In [21]:
from backend.utils.retrieval_utils import batch_encode

# gemini_client = GeminiClient(api_key = "AIzaSyDXyPiC6yxDhy9CO6EkfAYUIV-mKJ8V1OA")

# sec_df['Embedding'] = sec_df_exploded['Chunks'].apply(lambda x: gemini_client.get_gemini_embedding(x))

chunks = sec_df_exploded['Chunks'].tolist()

embeddings = batch_encode(chunks)
sec_df_exploded['Embedding'] = embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
from qdrant_client.http import models as qmodels
import uuid

def insert_filing_chunks(client, df, collection_name):
    """
    Insert chunk embeddings + metadata into Qdrant.
    Args:
        df: DataFrame with chunked text and metadata.
        embeddings: list of embeddings aligned with df['Chunks']
    """
    points = []

    for i, row in df.iterrows():
        points.append(
            qmodels.PointStruct(
                id=str(uuid.uuid4()),
                vector=row["Embedding"],
                payload={
                    "ticker": row["ticker"],
                    "filing_type": row["type"],
                    "filing_title": row["title"],
                    "accession_number": row["accession_number"],
                    "filing_date": row["date"],
                    "url": row["url"],
                    "chunk_length": row["Chunk_Length"],
                    "content_length": row["Content_Length"],
                    "chunk_index": i,
                    "text": row["Chunks"]  # useful for hybrid search
                }
            )
        )

    client.upsert(
        collection_name=collection_name,
        wait=True,
        points=points
    )

    print(f"✅ Inserted {len(points)} chunks into Qdrant collection '{collection_name}'")


In [23]:
insert_filing_chunks(qdrant, sec_df_exploded, collection_name)

✅ Inserted 1578 chunks into Qdrant collection 'sec_filings'


In [23]:
sec_df_exploded.to_excel("Preprocessed_SEC_fillings_data.xlsx", index = False)