In [6]:
import os
import time
import requests
# from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from pymongo import MongoClient
from langchain_experimental.text_splitter import SemanticChunker
import requests

In [9]:
# Ingestion parameters: ticker to fetch and the (year, quarter) grid to request.
symbol = "AAPL"
years = [2025, 2024]
quarters = [1, 2, 3, 4]
# The Financial Modeling Prep API key is read from the FMP_API_KEY environment
# variable instead of being hardcoded, so it is not stored in the notebook.
# NOTE(review): the key that was previously committed in this cell should be
# treated as compromised and rotated.
fmp_api_key = os.getenv("FMP_API_KEY")

In [5]:
from pymongo import MongoClient
from langchain.embeddings import OpenAIEmbeddings
import os
import re
import numpy as np
from datetime import datetime
from helper import (
    processing_html2txt,
    combine_sentences,
    calculate_cosine_distances,
    find_appropriate_threshold,
)
from pdf_to_gcp import HtmlToPdfGcpUploader
from sec_downloader import Downloader

# MongoDB setup
# The connection string is read from the MONGO_URI environment variable.
# NOTE(review): os.getenv returns None when the variable is unset; presumably
# MongoClient then falls back to its default host — confirm this is intended.
mongo_client = MongoClient(os.getenv("MONGO_URI"))
db = mongo_client["qualitative"]
# Collection that process_and_insert_to_mongodb() writes chunk documents to.
collection = db["earnings"]

# Embedding model (OpenAI)
# The API key is read from the OPENAI_API_KEY environment variable. This client
# is used for both the sentence-level and the chunk-level embeddings below.
oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# NOTE(review): `dl` is not referenced in any of the visible cells — confirm it
# is still needed before keeping this line.
dl = Downloader("Traderware", "x.tan@traderverse.io")

def process_and_insert_to_mongodb(file):
    """Semantically chunk one transcript, embed the chunks, and store them.

    The transcript text is split into sentences, the sentences are passed
    through ``combine_sentences`` and embedded, and a chunk boundary is placed
    wherever the distance returned by ``calculate_cosine_distances`` exceeds a
    percentile threshold chosen by ``find_appropriate_threshold``. Every chunk
    is then embedded and written to the module-level ``collection``.

    Args:
        file: Sequence whose first element is a dict; only ``file[0]`` is
            used. The keys read from it are ``"content"``, ``"symbol"``,
            ``"period"`` and ``"date"`` (each defaulting to ``""``).

    Raises:
        ValueError: If ``file[0]["date"]`` is missing or is not accepted by
            ``datetime.fromisoformat``. The date is parsed before any
            embedding request is made, so a bad record fails without
            spending API calls and without inserting anything.
    """
    record = file[0]
    symbol = record.get("symbol", "")
    period = record.get("period", "")
    safe_date = record.get("date", "")
    # Fail fast: validate the date before the (slow, billed) embedding calls.
    parsed_date = datetime.fromisoformat(safe_date)

    # Step 1: Split after '.', '#' or ':' followed by whitespace.
    sentence_texts = re.split(r"(?<=[.#:])\s+", record.get("content", ""))
    sentences = [{"sentence": s, "index": i} for i, s in enumerate(sentence_texts)]
    sentences = combine_sentences(sentences)

    # Step 2: Embed each combined sentence (OpenAI)
    embeddings = oaiembeds.embed_documents(
        [entry["combined_sentence"] for entry in sentences]
    )
    for entry, emb in zip(sentences, embeddings):
        entry["combined_sentence_embedding"] = emb

    # Step 3: Semantic chunking
    distances, sentences = calculate_cosine_distances(sentences)
    # Only the threshold is needed here; the chunks are rebuilt below.
    # NOTE(review): 95 and 1000 are presumably the starting percentile and a
    # chunk-size limit — confirm against helper.find_appropriate_threshold.
    threshold, _chunks, _chunk_sizes = find_appropriate_threshold(
        sentences, distances, 95, 1000
    )
    breakpoint_distance_threshold = np.percentile(distances, threshold)
    indices_above_thresh = [
        i for i, dist in enumerate(distances) if dist > breakpoint_distance_threshold
    ]

    # Step 4: Group sentences into chunks, cutting after each breakpoint index
    chunk_texts = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index : index + 1]
        chunk_texts.append(" ".join(d["sentence"] for d in group))
        start_index = index + 1
    if start_index < len(sentences):
        # Trailing sentences after the last breakpoint form the final chunk.
        chunk_texts.append(" ".join(d["sentence"] for d in sentences[start_index:]))

    # Step 5: Embed chunks with OpenAI
    chunk_embeddings = oaiembeds.embed_documents(chunk_texts)

    # Step 6: Insert into MongoDB in a single batch
    # Metadata values are bound to locals first so the f-strings do not nest
    # double quotes, which is a SyntaxError on Python versions before 3.12.
    file_name = f"{symbol}_{period}_{safe_date}"
    docs = [
        {
            "content": chunk,
            "embedding": vector,
            "file_name": file_name,
            "ticker": symbol,
            "quarter": period,
            "date": parsed_date,
        }
        for chunk, vector in zip(chunk_texts, chunk_embeddings)
    ]
    if docs:  # insert_many rejects an empty document list
        collection.insert_many(docs)

    print(f"✅ Inserted {len(docs)} chunks using OpenAI embeddings for: {symbol}")


  oaiembeds = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))


In [10]:
# Ingest
# For every (year, quarter) pair, fetch the earnings-call transcript for
# `symbol` from Financial Modeling Prep and pass it to
# process_and_insert_to_mongodb(). Periods without a transcript are skipped;
# a failure while processing one transcript is reported and the loop continues.
# HTTP-level failures (bad status, timeout) are raised so they are not mistaken
# for "no data".
for year in years:
    for quarter in quarters:
        print(f"Fetching {symbol} Q{quarter} {year}...")
        url = "https://financialmodelingprep.com/stable/earning-call-transcript"
        response = requests.get(
            url,
            params={
                "symbol": symbol,
                "year": year,
                "quarter": quarter,
                "apikey": fmp_api_key,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        data = response.json()

        if not data:
            print(f"No data for Q{quarter} {year}. Skipping.")
            continue
        if not isinstance(data, list):
            # A non-list payload cannot be indexed as data[0] below.
            print(f"Unexpected response for Q{quarter} {year}: {data!r}. Skipping.")
            continue
        transcript = data[0].get("content", "")
        if not transcript:
            print(f"No transcript content for Q{quarter} {year}. Skipping.")
            continue
        try:
            process_and_insert_to_mongodb(data)
        except Exception as e:
            # Report against the fetched record; the original handler referenced
            # an undefined name `file`, which raised NameError and hid the error.
            transcript_date = data[0].get("date", "")
            print(f"Error processing file {transcript_date}: {e}")
        

Fetching AAPL Q1 2025...
✅ Inserted 81 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q2 2025...
No data for Q2 2025. Skipping.
Fetching AAPL Q3 2025...
No data for Q3 2025. Skipping.
Fetching AAPL Q4 2025...
No data for Q4 2025. Skipping.
Fetching AAPL Q1 2024...
✅ Inserted 136 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q2 2024...
✅ Inserted 114 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q3 2024...
✅ Inserted 77 chunks using OpenAI embeddings for: AAPL
Fetching AAPL Q4 2024...
✅ Inserted 81 chunks using OpenAI embeddings for: AAPL
