In [1]:
import os
import time
import requests
# from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from pymongo import MongoClient
from langchain_experimental.text_splitter import SemanticChunker
import requests

In [2]:
# Query parameters for the FMP SEC-filings search below.
symbol = "AAPL"
date_from = "2024-04-16"
date_to = "2025-04-16"

# Credentials must not live in the notebook: read the key from the environment
# (e.g. `export FMP_API_KEY=...` before starting the kernel). The key that was
# previously committed in this cell should be treated as leaked and rotated.
fmp_api_key = os.getenv("FMP_API_KEY")
if not fmp_api_key:
    print("⚠️ FMP_API_KEY is not set; the FMP request below will be rejected.")

In [3]:
# Ask FMP for the SEC filings of `symbol` filed between `date_from` and `date_to`.
url = f"https://financialmodelingprep.com/stable/sec-filings-search/symbol?symbol={symbol}&from={date_from}&to={date_to}&apikey={fmp_api_key}"
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail here on HTTP errors (bad key, rate limit, ...) instead of handing an
# error payload to the ingestion cells further down.
response.raise_for_status()
data = response.json()

In [4]:
# Inspect the raw API response (the output below shows a list of filing records).
data

[{'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-04 00:00:00',
  'acceptedDate': '2025-04-04 16:30:11',
  'formType': 'SD',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000114036125012359/0001140361-25-012359-index.htm',
  'finalLink': 'https://www.sec.gov/Archives/edgar/data/320193/000114036125012359/ef20046400_sd.htm'},
 {'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-03 00:00:00',
  'acceptedDate': '2025-04-03 18:33:37',
  'formType': '4',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000045/0000320193-25-000045-index.htm',
  'finalLink': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000045/xslF345X05/wk-form4_1743719610.xml'},
 {'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-03 00:00:00',
  'acceptedDate': '2025-04-03 18:32:58',
  'formType': '4',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000044/0000320193-25-000044-index.htm',
  'finalLink': 'http

In [None]:
from pymongo import MongoClient
from langchain.embeddings import OpenAIEmbeddings
import os
import re
import numpy as np
from datetime import datetime
from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from pdf_to_gcp import HtmlToPdfGcpUploader
from sec_downloader import Downloader

# --- MongoDB: database "qualitative", collection "fillings" -----------------
# The connection string is taken from the MONGO_URI environment variable.
mongo_client = MongoClient(os.environ.get("MONGO_URI"))
db = mongo_client.get_database("qualitative")
collection = db.get_collection("fillings")

# --- OpenAI embedding model, keyed by OPENAI_API_KEY -------------------------
oaiembeds = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))

# --- SEC filing downloader ---------------------------------------------------
# NOTE(review): the two arguments look like the requester name and contact
# e-mail used to identify this client to the SEC — confirm against sec_downloader.
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_to_mongodb(file):
    """Download one SEC filing, split it into semantic chunks and store them in MongoDB.

    Pipeline: fetch the filing -> convert HTML to text -> split into sentences ->
    embed the combined sentences -> cut where the distance between neighbours
    exceeds a percentile threshold -> embed each chunk -> insert one document
    per chunk into ``collection``.

    Args:
        file: Filing record (dict) providing the keys ``finalLink``,
            ``filingDate`` (e.g. ``"2025-04-04 00:00:00"``), ``symbol`` and
            ``formType``.

    Returns:
        None. Prints a one-line summary with the number of chunks inserted.

    Raises:
        Whatever the fallback download, the helper functions, the embedding
        client or MongoDB raise; only the primary download is retried via the
        fallback path.
    """
    # Step 1: Fetch and clean HTML.
    try:
        html_content = dl.download_filing(url=file["finalLink"]).decode()
    except Exception:
        # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupts
        # still propagate instead of silently triggering the fallback download.
        html_content = HtmlToPdfGcpUploader().download_using_request(file["finalLink"])

    raw_essay = processing_html2txt(html_content)
    # Split on whitespace that follows '.', '#' or ':'.
    sentence_texts = re.split(r"(?<=[.#:])\s+", raw_essay)
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # Step 2: Embed each combined sentence (OpenAI).
    embeddings = oaiembeds.embed_documents(
        [x["combined_sentence"] for x in sentences]
    )
    for sentence, emb in zip(sentences, embeddings):
        sentence["combined_sentence_embedding"] = emb

    # Step 3: Semantic chunking.
    distances, sentences = calculate_cosine_distances(sentences)

    # Only the threshold is needed here; the helper's other two return values
    # are discarded.
    # NOTE(review): 95 and 1000 are presumably the starting percentile and a
    # chunk-size limit — confirm against helper.find_appropriate_threshold.
    threshold, _, _ = find_appropriate_threshold(sentences, distances, 95, 1000)
    breakpoint_distance_threshold = np.percentile(distances, threshold)
    indices_above_thresh = [
        i for i, x in enumerate(distances) if x > breakpoint_distance_threshold
    ]

    # Step 4: Group sentences into chunks; each breakpoint index closes a chunk.
    chunk_texts = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index : index + 1]
        chunk_texts.append(" ".join(d["sentence"] for d in group))
        start_index = index + 1
    if start_index < len(sentences):
        # Trailing sentences after the last breakpoint form the final chunk.
        chunk_texts.append(" ".join(d["sentence"] for d in sentences[start_index:]))

    # Step 5: Embed chunks with OpenAI.
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)

    # Step 6: Insert into MongoDB.
    # Per-filing fields are computed once instead of once per chunk.
    safe_date = file["filingDate"].split(" ")[0]
    file_name = f"{file['symbol']}_{file['formType']}_{safe_date}"
    filing_date = datetime.fromisoformat(safe_date)
    docs = [
        {
            "content": chunk,
            "embedding": vector,
            "file_name": file_name,
            "ticker": file["symbol"],
            "filling_type": file["formType"],
            "date": filing_date,
        }
        for chunk, vector in zip(chunk_texts, chunk_embeddings)
    ]
    # One bulk write instead of one round trip per chunk; insert_many rejects
    # an empty list, hence the guard.
    if docs:
        collection.insert_many(docs)

    print(f"✅ Inserted {len(chunk_texts)} chunks using OpenAI embeddings for: {file['symbol']}")


In [73]:
def batch_process(data, batch_size=100, process=None):
    """Run the MongoDB ingestion over every 8-K / 10-K / 10-Q filing in ``data``.

    Errors are handled per filing: a failure is reported and the loop moves on
    to the next filing. (Previously the ``try`` wrapped the whole batch, so the
    first failure skipped every remaining filing of that batch.)

    Args:
        data: Sequence of filing records (dicts with at least ``formType`` and
            ``filingDate``).
        batch_size: Number of records per slice of ``data``.
        process: Callable invoked with each matching filing record. Defaults to
            ``process_and_insert_to_mongodb``.

    Returns:
        None. Progress and errors are printed.
    """
    if process is None:
        # Resolved at call time so the default follows the current definition.
        process = process_and_insert_to_mongodb

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        for file in batch:
            try:
                if file["formType"] not in {"8-K", "10-K", "10-Q"}:
                    continue
                print("Processing file: ",file['formType']," ",file['filingDate'])
                process(file)
            except Exception as e:
                # The "Processing file" line printed just before identifies
                # which filing this error belongs to.
                print(f"Error processing batch {i}–{i+batch_size}: {e}")

# Run the ingestion over the fetched filings using the batch_process defined
# just above; the lines that follow in this cell are its captured run log.
batch_process(data, batch_size=100)


Processing file:  8-K   2025-02-25 00:00:00


  self._process_recursively(elements, _context=context)


✅ Inserted 3 chunks using OpenAI embeddings for: AAPL
Processing file:  10-Q   2025-01-31 00:00:00
✅ Inserted 29 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2025-01-30 00:00:00
✅ Inserted 4 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2025-01-03 00:00:00


  inner_elements = self._process_recursively(


✅ Inserted 3 chunks using OpenAI embeddings for: AAPL
Processing file:  10-K   2024-11-01 00:00:00


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


✅ Inserted 206 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-10-31 00:00:00
✅ Inserted 4 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-09-10 00:00:00


  return self._process(elements)


✅ Inserted 3 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-08-26 00:00:00


  return self._process(elements)


✅ Inserted 2 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-08-23 00:00:00
✅ Inserted 493 chunks using OpenAI embeddings for: AAPL
Processing file:  10-Q   2024-08-02 00:00:00
✅ Inserted 35 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-08-01 00:00:00
✅ Inserted 3 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-05-03 00:00:00
✅ Inserted 89 chunks using OpenAI embeddings for: AAPL
Processing file:  10-Q   2024-05-03 00:00:00
✅ Inserted 27 chunks using OpenAI embeddings for: AAPL
Processing file:  8-K   2024-05-02 00:00:00
✅ Inserted 4 chunks using OpenAI embeddings for: AAPL


In [5]:
# Re-display the fetched filing records before the Qdrant ingestion cells.
data

[{'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-04 00:00:00',
  'acceptedDate': '2025-04-04 16:30:11',
  'formType': 'SD',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000114036125012359/0001140361-25-012359-index.htm',
  'finalLink': 'https://www.sec.gov/Archives/edgar/data/320193/000114036125012359/ef20046400_sd.htm'},
 {'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-03 00:00:00',
  'acceptedDate': '2025-04-03 18:33:37',
  'formType': '4',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000045/0000320193-25-000045-index.htm',
  'finalLink': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000045/xslF345X05/wk-form4_1743719610.xml'},
 {'symbol': 'AAPL',
  'cik': '0000320193',
  'filingDate': '2025-04-03 00:00:00',
  'acceptedDate': '2025-04-03 18:32:58',
  'formType': '4',
  'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019325000044/0000320193-25-000044-index.htm',
  'finalLink': 'http

In [7]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, PayloadSchemaType
from langchain.embeddings import OpenAIEmbeddings
import os
import re
import numpy as np
from datetime import datetime
from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from pdf_to_gcp import HtmlToPdfGcpUploader
from sec_downloader import Downloader

# Qdrant setup — the endpoint can be overridden with the QDRANT_URL environment
# variable; when it is unset the original local default is used.
qdrant_client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))

# Embedding model (OpenAI), keyed by OPENAI_API_KEY
oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# SEC filing downloader
# NOTE(review): the arguments look like requester name and contact e-mail — confirm.
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_filings_to_qdrant(file):
    """Download one SEC filing, split it into semantic chunks and upsert them into Qdrant.

    Pipeline: fetch the filing -> convert HTML to text -> split into sentences ->
    embed the combined sentences -> cut where the distance between neighbours
    exceeds a percentile threshold -> embed each chunk -> upsert one point per
    chunk into the ``"fillings"`` collection (created on first use, with a
    datetime payload index on ``date``).

    Point IDs are UUIDv5 values derived from the filing's ``finalLink`` and the
    chunk position, so re-running the same filing overwrites its points instead
    of duplicating them. Points left over from an earlier run that produced
    more chunks are not removed.

    Args:
        file: Filing record (dict) providing the keys ``finalLink``,
            ``filingDate`` (e.g. ``"2025-01-30 00:00:00"``), ``symbol`` and
            ``formType``.

    Returns:
        None. Prints a one-line summary.
    """
    import uuid  # local import: only needed for the deterministic point IDs

    # Step 1: Fetch & clean HTML.
    try:
        html = dl.download_filing(url=file["finalLink"]).decode()
    except Exception:
        # `except Exception` (not a bare `except`) so kernel interrupts still
        # propagate instead of silently triggering the fallback download.
        html = HtmlToPdfGcpUploader().download_using_request(file["finalLink"])
    raw_text = processing_html2txt(html)

    # Step 2: Sentence splitting (on whitespace after '.', '#' or ':') & combine.
    sentence_texts = re.split(r"(?<=[.#:])\s+", raw_text)
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # Step 3: Embed sentences for semantic chunking.
    sent_embeds = oaiembeds.embed_documents([s["combined_sentence"] for s in sentences])
    for sentence, emb in zip(sentences, sent_embeds):
        sentence["combined_sentence_embedding"] = emb

    # Step 4: Semantic chunking.
    distances, sentences = calculate_cosine_distances(sentences)
    # NOTE(review): 95 and 1000 are presumably the starting percentile and a
    # chunk-size limit — confirm against helper.find_appropriate_threshold.
    threshold, _, _ = find_appropriate_threshold(sentences, distances, 95, 1000)
    # Distance value at the chosen percentile; larger gaps start a new chunk.
    break_distance = np.percentile(distances, threshold)
    boundaries = [i for i, d in enumerate(distances) if d > break_distance]

    chunk_texts = []
    start = 0
    for b in boundaries:
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start : b + 1]))
        start = b + 1
    if start < len(sentences):
        # Trailing sentences after the last boundary form the final chunk.
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start:]))

    # Step 5: Embed chunks.
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)
    if not chunk_embeddings:
        # Nothing to store, and the vector size below could not be inferred
        # (chunk_embeddings[0] would raise IndexError).
        print(f"⚠️ No chunks produced for: {file['symbol']} – nothing inserted")
        return

    # Step 6: Ensure the collection exists; vector size comes from the embeddings.
    collection_name = "fillings"
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=len(chunk_embeddings[0]),
                distance=Distance.COSINE
            )
        )

    # Step 7: Ensure a datetime index on "date".
    try:
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name="date",
            field_schema=PayloadSchemaType.DATETIME,
            wait=True
        )
    except Exception as exc:
        # Still best-effort (an existing index must not abort ingestion), but
        # report the reason instead of hiding e.g. connectivity problems.
        print(f"⚠️ Could not create payload index on 'date': {exc}")

    # Step 8: Parse & normalize filing date.
    raw_date = file["filingDate"].split(" ")[0]  # e.g. "2025-01-30"
    try:
        dt = datetime.strptime(raw_date, "%Y-%m-%d")
    except ValueError:
        dt = datetime.fromisoformat(raw_date)
    safe_date_iso = dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Step 9: Build points.
    file_name = f"{file['symbol']}_{file['formType']}_{raw_date}"
    points = [
        PointStruct(
            # Deterministic ID: stable across re-runs, and keyed on finalLink
            # rather than file_name because two filings of the same form type
            # can share a filing date.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file['finalLink']}#chunk-{idx}")),
            vector=vector,
            payload={
                "content":      chunk,
                "file_name":    file_name,
                "ticker":       file["symbol"],
                "filling_type": file["formType"],
                "date":         safe_date_iso,
            }
        )
        for idx, (chunk, vector) in enumerate(zip(chunk_texts, chunk_embeddings))
    ]

    # Step 10: Upsert into Qdrant.
    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"✅ Inserted {len(points)} chunks into Qdrant for: {file['symbol']}")



In [8]:
def batch_process(data, batch_size=100, process=None):
    """Run the Qdrant ingestion over every 8-K / 10-K / 10-Q filing in ``data``.

    Errors are handled per filing: a failure is reported and the loop moves on
    to the next filing. (Previously the ``try`` wrapped the whole batch, so the
    first failure skipped every remaining filing of that batch.)

    Args:
        data: Sequence of filing records (dicts with at least ``formType`` and
            ``filingDate``).
        batch_size: Number of records per slice of ``data``.
        process: Callable invoked with each matching filing record. Defaults to
            ``process_and_insert_filings_to_qdrant``.

    Returns:
        None. Progress and errors are printed.
    """
    if process is None:
        # Resolved at call time so the default follows the current definition.
        process = process_and_insert_filings_to_qdrant

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        for file in batch:
            try:
                if file["formType"] not in {"8-K", "10-K", "10-Q"}:
                    continue
                print("Processing file: ",file['formType']," ",file['filingDate'])
                process(file)
            except Exception as e:
                # The "Processing file" line printed just before identifies
                # which filing this error belongs to.
                print(f"Error processing batch {i}–{i+batch_size}: {e}")

# Run the ingestion over the fetched filings using the batch_process defined
# just above; the lines that follow in this cell are its captured run log.
batch_process(data, batch_size=100)


Processing file:  8-K   2025-02-25 00:00:00


  self._process_recursively(elements, _context=context)


✅ Inserted 3 chunks into Qdrant for: AAPL
Processing file:  10-Q   2025-01-31 00:00:00
✅ Inserted 29 chunks into Qdrant for: AAPL
Processing file:  8-K   2025-01-30 00:00:00
✅ Inserted 4 chunks into Qdrant for: AAPL
Processing file:  8-K   2025-01-03 00:00:00


  inner_elements = self._process_recursively(


✅ Inserted 3 chunks into Qdrant for: AAPL
Processing file:  10-K   2024-11-01 00:00:00


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


✅ Inserted 206 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-10-31 00:00:00
✅ Inserted 4 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-09-10 00:00:00


  return self._process(elements)


✅ Inserted 3 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-08-26 00:00:00


  return self._process(elements)


✅ Inserted 2 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-08-23 00:00:00
✅ Inserted 493 chunks into Qdrant for: AAPL
Processing file:  10-Q   2024-08-02 00:00:00
✅ Inserted 35 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-08-01 00:00:00
✅ Inserted 3 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-05-03 00:00:00
✅ Inserted 89 chunks into Qdrant for: AAPL
Processing file:  10-Q   2024-05-03 00:00:00
✅ Inserted 27 chunks into Qdrant for: AAPL
Processing file:  8-K   2024-05-02 00:00:00
✅ Inserted 4 chunks into Qdrant for: AAPL
