In [1]:
import os
import time
import requests
# from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from pymongo import MongoClient
from langchain_experimental.text_splitter import SemanticChunker
import requests

In [2]:
symbol = "AAPL"
years = [2025, 2024]
quarters = [1, 2, 3, 4]
fmp_api_key = "b6adf265209f12e18fd61e2f403585c3"

In [3]:
from pymongo import MongoClient
from langchain.embeddings import OpenAIEmbeddings
import os
import re
import numpy as np
from datetime import datetime
from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from pdf_to_gcp import HtmlToPdfGcpUploader
from sec_downloader import Downloader

# MongoDB setup
mongo_client = MongoClient(os.getenv("MONGO_URI"))
db = mongo_client["qualitative"]
collection = db["earnings"]

# Embedding model (OpenAI)
oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_to_mongodb(file):
    sentence_texts = re.split(r"(?<=[.#:])\s+", file[0].get("content", ""))
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # Step 2: Embed each sentence (OpenAI)
    embeddings = oaiembeds.embed_documents(
        [x["combined_sentence"] for x in sentences]
    )
    for i, emb in enumerate(embeddings):
        sentences[i]["combined_sentence_embedding"] = emb

    # Step 3: Semantic chunking
    distances, sentences = calculate_cosine_distances(sentences)
    
    threshold, chunks, chunk_sizes = find_appropriate_threshold(sentences, distances, 95, 1000)
    breakpoint_distance_threshold = np.percentile(distances, threshold)
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    # Step 4: Group sentences into chunks
    chunk_texts = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index : index + 1]
        chunk_texts.append(" ".join([d["sentence"] for d in group]))
        start_index = index + 1
    if start_index < len(sentences):
        chunk_texts.append(" ".join([d["sentence"] for d in sentences[start_index:]]))

    # Step 5: Embed chunks with OpenAI
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)

    # Step 6: Insert into MongoDB
    safe_date = file[0].get("date", "")
    for chunk, vector in zip(chunk_texts, chunk_embeddings):
        doc = {
            "content": chunk,
            "embedding": vector,
            "file_name": 
                f"{file[0].get("symbol", "")}_{file[0].get("period", "")}_{safe_date}",
            "ticker": file[0].get("symbol", ""),
            "quarter": file[0].get("period", ""),
            "date": datetime.fromisoformat(safe_date)
        }
        collection.insert_one(doc)

    print(f"✅ Inserted {len(chunk_texts)} chunks using OpenAI embeddings for: {file[0].get("symbol", "")}")


  oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))


In [10]:
# Ingest
for year in years:
    for quarter in quarters:
        print(f"Fetching {symbol} Q{quarter} {year}...")
        url = f"https://financialmodelingprep.com/stable/earning-call-transcript?symbol={symbol}&year={year}&quarter={quarter}&apikey={fmp_api_key}"
        response = requests.get(url)
        data = response.json()
        
        if not data:
            print(f"No data for Q{quarter} {year}. Skipping.")
            continue
        transcript = data[0].get("content", "")
        if not transcript:
            print(f"No transcript content for Q{quarter} {year}. Skipping.")
            continue
        try:
            process_and_insert_to_mongodb(data)
        except Exception as e:
            print(f"Error processing file {file[0].get("date", "")}: {e}")
        

Fetching AAPL Q1 2025...
✅ Inserted 81 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q2 2025...
No data for Q2 2025. Skipping.
Fetching AAPL Q3 2025...
No data for Q3 2025. Skipping.
Fetching AAPL Q4 2025...
No data for Q4 2025. Skipping.
Fetching AAPL Q1 2024...
✅ Inserted 136 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q2 2024...
✅ Inserted 114 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q3 2024...
✅ Inserted 77 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q4 2024...
✅ Inserted 81 chunks using OpenAI embeddings for: AAPL


In [9]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, PayloadSchemaType
from langchain.embeddings import OpenAIEmbeddings
import os
import re
import numpy as np
from datetime import datetime
from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from sec_downloader import Downloader

# Qdrant setup
qdrant_client = QdrantClient(url="http://localhost:6333")

# Embedding model
oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_earnings_to_qdrant(file):
    # 1️⃣ Sentence splitting
    sentence_texts = re.split(r"(?<=[.#:])\s+", file[0].get("content", ""))
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # 2️⃣ Embed sentences for semantic chunking
    sent_embeds = oaiembeds.embed_documents([s["combined_sentence"] for s in sentences])
    for i, emb in enumerate(sent_embeds):
        sentences[i]["combined_sentence_embedding"] = emb

    # 3️⃣ Semantic chunking
    distances, sentences = calculate_cosine_distances(sentences)
    threshold, _, _ = find_appropriate_threshold(sentences, distances, 95, 1000)
    break_idx = np.percentile(distances, threshold)
    boundaries = [i for i, d in enumerate(distances) if d > break_idx]

    chunk_texts = []
    start = 0
    for b in boundaries:
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start : b + 1]))
        start = b + 1
    if start < len(sentences):
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start:]))

    # 4️⃣ Embed chunks
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)

    # 5️⃣ Ensure collection + payload index exists
    collection_name = "earnings"
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=len(chunk_embeddings[0]),
                distance=Distance.COSINE
            )
        )
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name="date",
            field_schema=PayloadSchemaType.DATETIME,
            wait=True
        )

    # 6️⃣ Parse & normalize date to "YYYY-MM-DDThh:mm:ssZ"
    raw_date = file[0].get("date", "")
    safe_date_iso = None
    if raw_date:
        try:
            dt = datetime.strptime(raw_date, "%Y-%m-%d")
        except ValueError:
            dt = datetime.fromisoformat(raw_date)
        safe_date_iso = dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    # 7️⃣ Build and upsert points
    base_id = int(datetime.now().timestamp() * 1000)
    points = []
    for idx, (chunk, vector) in enumerate(zip(chunk_texts, chunk_embeddings)):
        points.append(
            PointStruct(
                id=base_id + idx,
                vector=vector,
                payload={
                    "content":   chunk,
                    "file_name": f"{file[0].get('symbol','')}_{file[0].get('period','')}_{safe_date_iso or ''}",
                    "ticker":    file[0].get("symbol", ""),
                    "quarter":   file[0].get("period", ""),
                    "date":      safe_date_iso,
                }
            )
        )

    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"✅ Inserted {len(points)} earnings chunks for: {file[0].get('symbol','')}")


In [None]:
# Ingest
for year in years:
    for quarter in quarters:
        print(f"Fetching {symbol} Q{quarter} {year}...")
        url = f"https://financialmodelingprep.com/stable/earning-call-transcript?symbol={symbol}&year={year}&quarter={quarter}&apikey={fmp_api_key}"
        response = requests.get(url)
        data = response.json()
        
        if not data:
            print(f"No data for Q{quarter} {year}. Skipping.")
            continue
        transcript = data[0].get("content", "")
        if not transcript:
            print(f"No transcript content for Q{quarter} {year}. Skipping.")
            continue
        try:
            process_and_insert_earnings_to_qdrant(data)
        except Exception as e:
            print(f"Error processing file {data[0].get("date", "")}: {e}")
        

Fetching AAPL Q1 2025...
✅ Inserted 81 earnings chunks for: AAPL
Fetching AAPL Q2 2025...
No data for Q2 2025. Skipping.
Fetching AAPL Q3 2025...
No data for Q3 2025. Skipping.
Fetching AAPL Q4 2025...
No data for Q4 2025. Skipping.
Fetching AAPL Q1 2024...
✅ Inserted 136 earnings chunks for: AAPL
Fetching AAPL Q2 2024...
✅ Inserted 114 earnings chunks for: AAPL
Fetching AAPL Q3 2024...
✅ Inserted 77 earnings chunks for: AAPL
Fetching AAPL Q4 2024...
✅ Inserted 81 earnings chunks for: AAPL


: 

In [None]:
import os
import re
import numpy as np
from datetime import datetime

import weaviate
from langchain.embeddings import OpenAIEmbeddings

from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from sec_downloader import Downloader

# ──────────────────────────────────────────────────────────────────────────────
# 🚀 Weaviate client (local Docker)
client = weaviate.connect_to_local()

CLASS_NAME = "earnings"
# 1️⃣ Ensure class schema exists
if CLASS_NAME not in client.collections.list_all(simple=True):
    class_schema = {
        "class": CLASS_NAME,
        "description": "Chunks of company earnings content with metadata",
        "vectorizer": "none",
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {"distance": "cosine"},
        "properties": [
            {"name": "content",    "dataType": ["text"]},
            {"name": "file_name",  "dataType": ["string"]},
            {"name": "ticker",     "dataType": ["string"]},
            {"name": "quarter",    "dataType": ["string"]},
            {"name": "date",       "dataType": ["date"]},
        ],
    }
    client.collections.create_from_dict(class_schema)

# 2️⃣ Embedding model & downloader
oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_earnings_to_weaviate(file):
    # A. Sentence split & combine
    raw_content = file[0].get("content", "")
    sentence_texts = re.split(r"(?<=[.#:])\s+", raw_content)
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # B. Embed for semantic chunking
    sent_texts = [s["combined_sentence"] for s in sentences]
    sent_embeds = oaiembeds.embed_documents(sent_texts)
    for i, emb in enumerate(sent_embeds):
        sentences[i]["combined_sentence_embedding"] = emb

    # C. Compute boundaries
    distances, sentences = calculate_cosine_distances(sentences)
    thresh, _, _ = find_appropriate_threshold(sentences, distances, 95, 1000)
    break_val = np.percentile(distances, thresh)
    boundaries = [i for i, d in enumerate(distances) if d > break_val]

    # D. Build chunks
    chunk_texts, start = [], 0
    for b in boundaries:
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start:b+1]))
        start = b + 1
    if start < len(sentences):
        chunk_texts.append(" ".join(s["sentence"] for s in sentences[start:]))

    # E. Embed final chunks
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)

    # F. Normalize date to ISO
    raw_date = file[0].get("date", "")
    iso_date = ""
    if raw_date:
        try:
            dt = datetime.strptime(raw_date, "%Y-%m-%d")
        except ValueError:
            dt = datetime.fromisoformat(raw_date)
        iso_date = dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    # G. Batch insert via collection-scoped API
    collection = client.collections.get(CLASS_NAME)
    with collection.batch.fixed_size(batch_size=50, concurrent_requests=4) as batch:
        for idx, (chunk, vector) in enumerate(zip(chunk_texts, chunk_embeddings)):
            batch.add_object(
                properties={
                    "content":   chunk,
                    "file_name": f"{file[0].get('symbol','')}_{file[0].get('period','')}_{iso_date}",
                    "ticker":    file[0].get("symbol",""),
                    "quarter":   file[0].get("period",""),
                    "date":      iso_date,
                },
                vector=vector
            )

    print(f"✅ Upserted {len(chunk_texts)} earnings chunks for {file[0].get('symbol','')}")


            Please make sure to close the connection using `client.close()`.
  client = weaviate.connect_to_local()


UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Earnings already exists'}]}.

: 

In [5]:
# Ingest
for year in years:
    for quarter in quarters:
        print(f"Fetching {symbol} Q{quarter} {year}...")
        url = f"https://financialmodelingprep.com/stable/earning-call-transcript?symbol={symbol}&year={year}&quarter={quarter}&apikey={fmp_api_key}"
        response = requests.get(url)
        data = response.json()
        
        if not data:
            print(f"No data for Q{quarter} {year}. Skipping.")
            continue
        transcript = data[0].get("content", "")
        if not transcript:
            print(f"No transcript content for Q{quarter} {year}. Skipping.")
            continue
        try:
            process_and_insert_earnings_to_weaviate(data)
        except Exception as e:
            print(f"Error processing file {data[0].get("date", "")}: {e}")
        

Fetching AAPL Q1 2025...


c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()
c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()


✅ Upserted 81 earnings chunks for AAPL
Fetching AAPL Q2 2025...
No data for Q2 2025. Skipping.
Fetching AAPL Q3 2025...
No data for Q3 2025. Skipping.
Fetching AAPL Q4 2025...
No data for Q4 2025. Skipping.
Fetching AAPL Q1 2024...


c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()


✅ Upserted 136 earnings chunks for AAPL
Fetching AAPL Q2 2024...


c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()


✅ Upserted 114 earnings chunks for AAPL
Fetching AAPL Q3 2024...


c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()


✅ Upserted 77 earnings chunks for AAPL
Fetching AAPL Q4 2024...


c:\Users\hp\RAG_17-04-2025\venv\Lib\site-packages\langchain_community\embeddings\openai.py:503: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response = response.dict()


✅ Upserted 81 earnings chunks for AAPL
