In [7]:
from __future__ import annotations
import sec_parser as sp
from sec_parser import (
    Edgar10QParser,
    TitleElement,
    TextElement,
    TopSectionTitle,
)
import sec_downloader as sd

In [8]:
import os
from dotenv import load_dotenv
from sec_parser import Edgar10QParser, TreeBuilder
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import re

# Load variables from a .env file into the process environment so that
# os.getenv("OPENAI_API_KEY") below can find the key.
load_dotenv()

True

In [9]:
# Read the OpenAI key from the environment; os.getenv returns None when the
# variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [10]:
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity

# Starting percentile for breakpoint detection: distances above this percentile
# of all neighbour distances become chunk boundaries. find_appropriate_threshold
# lowers it step by step while any chunk exceeds the ceiling below.
breakpoint_percentile_threshold = 95  # initial value of the percentile search
# Largest allowed chunk, measured in whitespace-separated words.
chunk_size_ceiling = 2000

# extract all the text (excluding tables), convert to Markdown format
def convert_to_markdown(sections, level_to_markdown):
    """Render tree nodes and all of their descendants as Markdown text.

    Every node's ``semantic_element`` is inspected. ``TopSectionTitle`` and
    ``TitleElement`` nodes become a heading line whose prefix is looked up in
    ``level_to_markdown`` by the element's ``level`` (``#`` when the level is
    not in the mapping). ``TextElement`` nodes become a plain line. Any other
    element type contributes nothing.

    Args:
        sections: iterable of nodes exposing ``semantic_element`` and
            ``get_descendants()``.
        level_to_markdown: dict mapping a title level to its heading prefix.

    Returns:
        str: the rendered lines, each terminated by a newline.
    """

    def render(node):
        # One Markdown line for a title or text node; "" for anything else.
        element = node.semantic_element
        if isinstance(element, (TopSectionTitle, TitleElement)):
            heading_prefix = level_to_markdown.get(element.level, '#')
            return f"{heading_prefix} {element.text}\n"
        if isinstance(element, TextElement):
            return f"{element.text}\n"
        return ""

    pieces = []
    for section in sections:
        # The section itself first, then its descendants in the order
        # get_descendants() yields them.
        pieces.append(render(section))
        pieces.extend(render(child) for child in section.get_descendants())
    return "".join(pieces)

def combine_sentences(sentences, buffer_size=1):
    """Attach to every sentence dict a context window built from its neighbours.

    For the dict at position ``i``, the ``"sentence"`` values of up to
    ``buffer_size`` preceding entries, the entry itself, and up to
    ``buffer_size`` following entries are joined with single spaces and stored
    under ``"combined_sentence"``.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        buffer_size: number of neighbours to include on each side.

    Returns:
        The same list object; its dicts are updated in place.
    """
    total = len(sentences)
    for position, entry in enumerate(sentences):
        # Clamp the window to the list bounds. The outer min/max keep a
        # non-positive buffer_size from producing a reversed or negative slice,
        # so the window always contains at least the current sentence.
        first = min(position, max(0, position - buffer_size))
        last = max(position + 1, min(total, position + 1 + buffer_size))
        window = sentences[first:last]
        entry["combined_sentence"] = " ".join(item["sentence"] for item in window)
    return sentences

# Split the sentences into chunk texts at the largest neighbour distances
def calculate_chunk_sizes(sentences, distances, threshold):
    """Group ``sentences`` into text chunks, cutting where the distance is unusually large.

    Despite its name, this returns the chunk *texts*, not their sizes.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        distances: sequence of numbers; a value at position ``i`` that lies
            above the cut-off ends a chunk after ``sentences[i]``.
        threshold: percentile (0-100) handed to ``np.percentile`` to derive the
            cut-off from ``distances``.

    Returns:
        list[str]: the sentences of each chunk joined with single spaces.
    """
    # A document with zero or one sentence has no distances, and np.percentile
    # cannot work on an empty input: return everything as one chunk instead.
    if len(distances) == 0:
        if not sentences:
            return []
        return [" ".join(d["sentence"] for d in sentences)]

    # Determine the distance cut-off for this percentile
    breakpoint_distance_threshold = np.percentile(distances, threshold)

    # Positions whose distance exceeds the cut-off are the breakpoints
    indices_above_thresh = [
        i for i, x in enumerate(distances) if x > breakpoint_distance_threshold
    ]

    start_index = 0
    chunks = []

    # Slice the sentences at every breakpoint (the breakpoint sentence closes its chunk)
    for index in indices_above_thresh:
        group = sentences[start_index : index + 1]
        chunks.append(" ".join(d["sentence"] for d in group))
        start_index = index + 1

    # The last group, if any sentences remain
    if start_index < len(sentences):
        chunks.append(" ".join(d["sentence"] for d in sentences[start_index:]))

    return chunks


# Lower the breakpoint percentile until every chunk fits under the ceiling
def find_appropriate_threshold(sentences, distances, initial_threshold, ceiling):
    """Search downward from ``initial_threshold`` for a percentile whose chunks fit ``ceiling``.

    The percentile is lowered one point at a time. Chunk size is measured in
    whitespace-separated words. The search stops at the first percentile whose
    largest chunk is within ``ceiling``, or at the lowest non-negative
    percentile reachable in whole steps if none qualifies.

    Args:
        sentences: list of sentence dicts passed on to ``calculate_chunk_sizes``.
        distances: neighbour distances passed on to ``calculate_chunk_sizes``.
        initial_threshold: percentile at which the search starts.
        ceiling: maximum allowed number of words per chunk.

    Returns:
        tuple: ``(threshold, chunks, chunk_sizes)``. ``chunks`` and
        ``chunk_sizes`` always belong to the returned ``threshold``.
    """
    threshold = initial_threshold
    while True:
        chunks = calculate_chunk_sizes(sentences, distances, threshold)
        chunk_sizes = [len(chunk.split()) for chunk in chunks]
        # default=0 covers a document that produced no chunks at all.
        if max(chunk_sizes, default=0) <= ceiling:
            break
        # Stop before the percentile would drop below zero, keeping the
        # results of the last percentile that was actually evaluated.
        if threshold < 1:
            break
        threshold -= 1
    return threshold, chunks, chunk_sizes

def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence's embedding and the next one's.

    All neighbour pairs are handled in one vectorised pass instead of one
    library call per pair.

    Args:
        sentences: list of dicts, each holding a numeric vector under
            ``"combined_sentence_embedding"``; all vectors must have the same
            length.

    Returns:
        tuple: ``(distances, sentences)``. ``distances`` has one entry fewer
        than ``sentences`` (empty when there are fewer than two sentences).
        Each dict except the last also receives its value under
        ``"distance_to_next"``; the dicts are updated in place.
    """
    # No neighbour pairs exist for zero or one sentence.
    if len(sentences) < 2:
        return [], sentences

    matrix = np.asarray(
        [entry["combined_sentence_embedding"] for entry in sentences], dtype=float
    )

    # Normalise each row to unit length; an all-zero row is left as zeros
    # (similarity 0, distance 1) instead of dividing by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = matrix / norms

    # Row-wise dot product of every embedding with its successor.
    similarities = np.einsum("ij,ij->i", unit[:-1], unit[1:])
    distances = list(1 - similarities)

    # zip stops at the shorter list, so the last sentence gets no entry.
    for entry, distance in zip(sentences, distances):
        entry["distance_to_next"] = distance

    return distances, sentences

In [13]:
def process_financial_document(file_path):
    """
    Processes a financial document, chunks it, and stores the chunks in a FAISS vector store.

    The HTML is parsed with Edgar10QParser, titles and text are rendered to
    Markdown, the text is split into sentences, and the sentences are grouped
    into chunks at the largest semantic gaps between neighbouring sentence
    windows. The starting percentile and the chunk-size ceiling come from the
    module-level ``breakpoint_percentile_threshold`` and ``chunk_size_ceiling``.

    Side effects: writes the extracted Markdown to ``file_without_table.txt``
    in the current working directory and prints the percentile that was used.

    Args:
        file_path (str): Path to the financial document file.

    Returns:
        FAISS: The FAISS vector store containing the document chunks.
    """
    # Use one key lookup and one embeddings client for the whole function,
    # rather than mixing this value with a module-level global.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    embeddings_engine = OpenAIEmbeddings(openai_api_key=openai_api_key)

    # Step 1: Load and parse the financial document
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    parser = Edgar10QParser()
    elements = parser.parse(html_content)

    tree_builder = TreeBuilder()
    top_level_sections = [
        item for part in tree_builder.build(elements) for item in part.children
    ]

    # Map each title level found among the top-level sections to a Markdown
    # heading prefix, starting at "##" for the smallest level.
    levels = sorted(
        {
            k.semantic_element.level
            for k in top_level_sections
            if isinstance(k.semantic_element, (sp.TopSectionTitle, sp.TitleElement))
        }
    )
    level_to_markdown = {level: "#" * (i + 2) for i, level in enumerate(levels)}

    # Step 2: Extract titles and text as Markdown
    essay = convert_to_markdown(top_level_sections, level_to_markdown)

    # Keep a copy of the extracted text on disk for inspection.
    new_txt_filename = "file_without_table.txt"
    with open(new_txt_filename, "w", encoding="utf-8") as file:
        file.write(essay)

    # Step 3: Split at whitespace that follows '.', '#' or ':'
    single_sentences_list = re.split(r"(?<=[.#:])\s+", essay)

    # Turn the sentences into dicts so later steps can attach data to them
    sentences = [{"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)]
    sentences = combine_sentences(sentences)

    # Step 4: Embed every sentence window in one batch
    embeddings = embeddings_engine.embed_documents([x["combined_sentence"] for x in sentences])
    for i, sentence in enumerate(sentences):
        sentence["combined_sentence_embedding"] = embeddings[i]

    distances, sentences = calculate_cosine_distances(sentences)

    # Step 5: Find a percentile whose chunks respect the size ceiling
    threshold, _, _ = find_appropriate_threshold(
        sentences, distances, breakpoint_percentile_threshold, chunk_size_ceiling
    )

    print("Final threshold used:", threshold)

    # Build the final chunks at the reported percentile with the shared helper
    # instead of repeating its slicing logic here.
    chunks = calculate_chunk_sizes(sentences, distances, threshold)

    # `chunks` is a list of strings; wrap each one as a Document for storage
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Embed the chunks and store them in a FAISS vector store
    vectorstore = FAISS.from_documents(documents, embeddings_engine)

    return vectorstore


In [14]:
# Build a vector store from the AAPL 10-K (2023) filing.
# NOTE(review): process_financial_document parses with Edgar10QParser — confirm
# that this parser is intended for 10-K filings as well.
doc_path = r"data/sec-edgar-filings/AAPL/10-k/2023/primary_document.html"
vs1 = process_financial_document(doc_path)

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Final threshold used: 83


In [15]:
# Check how many vectors the store's index holds.
# `index` is reused by the following cell.
index = vs1.index
print(f"Number of vectors in the store: {index.ntotal}")

Number of vectors in the store: 158


In [16]:
# Print the first 3 stored vectors (fewer if the index is smaller),
# reconstructed from the index by position.
for i in range(min(3, index.ntotal)):
    vector = index.reconstruct(i)
    print(f"Vector {i}: {vector}")

Vector 0: [ 0.0025478   0.00010713 -0.00851038 ... -0.0143102  -0.0221363
 -0.01553261]
Vector 1: [ 0.00370047 -0.01104865  0.00202998 ...  0.00796163  0.00790226
 -0.02796793]
Vector 2: [ 0.00859033 -0.00238253  0.01003133 ...  0.01358163 -0.01032371
 -0.01758441]


In [None]:
# Retrieve the k most relevant chunks from the vector store
retriever = vs1.as_retriever(search_kwargs={"k": 3})

query = " How is the Segment Operating Performance of Apple company in Greater China?"

docs = retriever.invoke(query)

# Print the retrieved documents
for i, doc in enumerate(docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

In [None]:
# Use forward slashes so the path resolves on non-Windows systems as well.
doc_path = "data/sec-edgar-filings/AAPL/10-q/2024/Q1/primary_document.html"
vs2 = process_financial_document(doc_path)

# check how many vectors we have in the store
index = vs2.index
print(f"Number of vectors in the store: {index.ntotal}")

# Print the first 3 stored vectors, reconstructed from the index by position
for i in range(min(3, index.ntotal)):
    vector = index.reconstruct(i)
    print(f"Vector {i}: {vector}")

# Retrieve the k most relevant chunks from the vector store
retriever = vs2.as_retriever(search_kwargs={"k": 3})

query = " How is the Segment Operating Performance of Apple company in Greater China?"

docs = retriever.invoke(query)

# Print the retrieved documents
for i, doc in enumerate(docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

In [10]:
import os
import re
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity




def convert_to_markdown(sections, level_to_markdown):
    """Render tree nodes and all of their descendants as Markdown text.

    ``TopSectionTitle`` and ``TitleElement`` nodes become heading lines whose
    prefix is looked up in ``level_to_markdown`` by the element's ``level``
    (``#`` when the level is not in the mapping). ``TextElement`` nodes become
    plain lines. Any other element type contributes nothing.

    Args:
        sections: iterable of nodes exposing ``semantic_element`` and
            ``get_descendants()``.
        level_to_markdown: dict mapping a title level to its heading prefix.

    Returns:
        str: the rendered lines, each terminated by a newline.
    """

    def render(node):
        # One Markdown line for a title or text node; "" for anything else.
        element = node.semantic_element
        if isinstance(element, (TopSectionTitle, TitleElement)):
            heading_prefix = level_to_markdown.get(element.level, '#')
            return f"{heading_prefix} {element.text}\n"
        if isinstance(element, TextElement):
            return f"{element.text}\n"
        return ""

    pieces = []
    for section in sections:
        # The section itself first, then its descendants in the order
        # get_descendants() yields them.
        pieces.append(render(section))
        pieces.extend(render(child) for child in section.get_descendants())
    return "".join(pieces)

def combine_sentences(sentences, buffer_size=1):
    """Attach to every sentence dict a context window built from its neighbours.

    For the dict at position ``i``, the ``"sentence"`` values of up to
    ``buffer_size`` preceding entries, the entry itself, and up to
    ``buffer_size`` following entries are joined with single spaces and stored
    under ``"combined_sentence"``.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        buffer_size: number of neighbours to include on each side.

    Returns:
        The same list object; its dicts are updated in place.
    """
    total = len(sentences)
    for position, entry in enumerate(sentences):
        # Clamp the window to the list bounds. The outer min/max keep a
        # non-positive buffer_size from producing a reversed or negative slice,
        # so the window always contains at least the current sentence.
        first = min(position, max(0, position - buffer_size))
        last = max(position + 1, min(total, position + 1 + buffer_size))
        window = sentences[first:last]
        entry["combined_sentence"] = " ".join(item["sentence"] for item in window)
    return sentences

def calculate_chunk_sizes(sentences, distances, threshold):
    """Group ``sentences`` into text chunks, cutting where the distance is unusually large.

    Despite its name, this returns the chunk *texts*, not their sizes.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        distances: sequence of numbers; a value at position ``i`` that lies
            above the cut-off ends a chunk after ``sentences[i]``.
        threshold: percentile (0-100) handed to ``np.percentile`` to derive the
            cut-off from ``distances``.

    Returns:
        list[str]: the sentences of each chunk joined with single spaces.
    """
    # A document with zero or one sentence has no distances, and np.percentile
    # cannot work on an empty input: return everything as one chunk instead.
    if len(distances) == 0:
        if not sentences:
            return []
        return [" ".join(d["sentence"] for d in sentences)]

    breakpoint_distance_threshold = np.percentile(distances, threshold)
    # Positions whose distance exceeds the cut-off are the breakpoints.
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    start_index = 0
    chunks = []
    for index in indices_above_thresh:
        # The breakpoint sentence closes its chunk.
        group = sentences[start_index : index + 1]
        chunks.append(" ".join(d["sentence"] for d in group))
        start_index = index + 1

    # Whatever follows the last breakpoint forms the final chunk.
    if start_index < len(sentences):
        chunks.append(" ".join(d["sentence"] for d in sentences[start_index:]))
    return chunks

def find_appropriate_threshold(sentences, distances, initial_threshold, ceiling):
    """Search downward from ``initial_threshold`` for a percentile whose chunks fit ``ceiling``.

    The percentile is lowered one point at a time. Chunk size is measured in
    whitespace-separated words. The search stops at the first percentile whose
    largest chunk is within ``ceiling``, or at the lowest non-negative
    percentile reachable in whole steps if none qualifies.

    Args:
        sentences: list of sentence dicts passed on to ``calculate_chunk_sizes``.
        distances: neighbour distances passed on to ``calculate_chunk_sizes``.
        initial_threshold: percentile at which the search starts.
        ceiling: maximum allowed number of words per chunk.

    Returns:
        tuple: ``(threshold, chunks, chunk_sizes)``. ``chunks`` and
        ``chunk_sizes`` always belong to the returned ``threshold``.
    """
    threshold = initial_threshold
    while True:
        chunks = calculate_chunk_sizes(sentences, distances, threshold)
        chunk_sizes = [len(chunk.split()) for chunk in chunks]
        # default=0 covers a document that produced no chunks at all.
        if max(chunk_sizes, default=0) <= ceiling:
            break
        # Stop before the percentile would drop below zero, keeping the
        # results of the last percentile that was actually evaluated.
        if threshold < 1:
            break
        threshold -= 1
    return threshold, chunks, chunk_sizes

def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence's embedding and the next one's.

    All neighbour pairs are handled in one vectorised pass instead of one
    library call per pair.

    Args:
        sentences: list of dicts, each holding a numeric vector under
            ``"combined_sentence_embedding"``; all vectors must have the same
            length.

    Returns:
        tuple: ``(distances, sentences)``. ``distances`` has one entry fewer
        than ``sentences`` (empty when there are fewer than two sentences).
        Each dict except the last also receives its value under
        ``"distance_to_next"``; the dicts are updated in place.
    """
    # No neighbour pairs exist for zero or one sentence.
    if len(sentences) < 2:
        return [], sentences

    matrix = np.asarray(
        [entry["combined_sentence_embedding"] for entry in sentences], dtype=float
    )

    # Normalise each row to unit length; an all-zero row is left as zeros
    # (similarity 0, distance 1) instead of dividing by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = matrix / norms

    # Row-wise dot product of every embedding with its successor.
    similarities = np.einsum("ij,ij->i", unit[:-1], unit[1:])
    distances = list(1 - similarities)

    # zip stops at the shorter list, so the last sentence gets no entry.
    for entry, distance in zip(sentences, distances):
        entry["distance_to_next"] = distance

    return distances, sentences

def process_financial_document(file_path):
    """
    Chunk one financial document semantically and index the chunks in FAISS.

    The HTML is parsed with Edgar10QParser, titles and text are rendered to
    Markdown, the text is split into sentences, and the sentences are grouped
    into chunks at the largest distances between neighbouring sentence
    windows. The search for a breakpoint percentile starts at 95 and accepts
    the first one whose chunks are all at most 2000 words.

    Args:
        file_path (str): Path to the financial document file.

    Returns:
        FAISS: The FAISS vector store containing the document chunks.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    # Starting percentile for the breakpoint search and the per-chunk word limit.
    starting_percentile = 95
    max_words_per_chunk = 2000

    with open(file_path, "r", encoding="utf-8") as handle:
        html_content = handle.read()

    parsed_elements = Edgar10QParser().parse(html_content)

    # Flatten the children of every tree part into one list of sections.
    top_level_sections = []
    for part in TreeBuilder().build(parsed_elements):
        top_level_sections.extend(part.children)

    # Title levels present among those sections, in ascending order; the
    # smallest level maps to "##", the next to "###", and so on.
    title_levels = sorted(
        {
            node.semantic_element.level
            for node in top_level_sections
            if isinstance(node.semantic_element, (TopSectionTitle, TitleElement))
        }
    )
    level_to_markdown = {
        level: "#" * (rank + 2) for rank, level in enumerate(title_levels)
    }

    markdown_text = convert_to_markdown(top_level_sections, level_to_markdown)

    # Split at whitespace that follows '.', '#' or ':'.
    pieces = re.split(r"(?<=[.#:])\s+", markdown_text)
    sentences = combine_sentences(
        [{"sentence": text, "index": position} for position, text in enumerate(pieces)]
    )

    embedder = OpenAIEmbeddings(openai_api_key=api_key)
    vectors = embedder.embed_documents(
        [entry["combined_sentence"] for entry in sentences]
    )
    for position in range(len(sentences)):
        sentences[position]["combined_sentence_embedding"] = vectors[position]

    distances, sentences = calculate_cosine_distances(sentences)

    # Only the chunk texts are needed; the percentile and sizes are discarded.
    _, chunks, _ = find_appropriate_threshold(
        sentences, distances, starting_percentile, max_words_per_chunk
    )

    documents = [Document(page_content=text) for text in chunks]
    return FAISS.from_documents(documents, embedder)


In [13]:
# Use forward slashes so the path resolves on non-Windows systems as well.
doc_path = "data/sec-edgar-filings/META/10-k/2023/primary_document.html"
vs2 = process_financial_document(doc_path)

# check how many vectors we have in the store
index = vs2.index
print(f"Number of vectors in the store: {index.ntotal}")

# Reconstruct the first 3 stored vectors by position (printing is disabled)
for i in range(min(3, index.ntotal)):
    vector = index.reconstruct(i)
    # print(f"Vector {i}: {vector}")

# Retrieve the k most relevant chunks from the vector store
retriever = vs2.as_retriever(search_kwargs={"k": 3})

query = " How is the Segment Operating Performance of META company in Australia?"

docs = retriever.invoke(query)

# Print the retrieved documents
for i, doc in enumerate(docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Number of vectors in the store: 640
Document 1:
Corporate Information
We were incorporated in Delaware in July 2004. We completed our initial public offering in May 2012 and our Class A common stock is currently listed on the Nasdaq Global Select Market under the symbol "META." Our principal executive offices are located at 1 Meta Way, Menlo Park, California 94025, and our telephone number is (650) 543-4800.Meta, the Meta logo, Meta Quest, Meta Horizon, Facebook, FB, Instagram, Oculus, WhatsApp, Reels, and our other registered or common law trademarks, service marks, or trade names appearing in this Annual Report on Form 10-K are the property of Meta Platforms, Inc. or its affiliates.

Document 2:
# Opinion on Internal Control over Financial Reporting
We have audited Meta Platforms, Inc.'s internal control over financial reporting as of December 31, 2023, based on criteria established in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the 

In [13]:
import os
import re
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity






In [15]:
# NOTE: process_financial_documents is defined in a later cell of this
# notebook; that cell has to be executed before this one.
file_paths = [
    r"data/sec-edgar-filings/AAPL/10-q/2024/Q1/primary_document.html",
    r"data/sec-edgar-filings/AAPL/10-k/2023/primary_document.html",
]

# Process the documents and create the FAISS vector store.
# process_financial_documents is declared to take one comma-separated string
# of paths, so the list is joined before it is passed in.
vectorstore = process_financial_documents(",".join(file_paths))

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


In [16]:
# check how many vectors we have in the store
index = vectorstore.index
print(f"Number of vectors in the store: {index.ntotal}")

# Reconstruct the first 3 stored vectors by position (printing is disabled)
for i in range(min(3, index.ntotal)):
    vector = index.reconstruct(i)
    # print(f"Vector {i}: {vector}")

# Retrieve the k most relevant chunks from the vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

query = " How is the Segment Operating Performance of Apple company in Asia?"

docs = retriever.invoke(query)

# Print the retrieved documents
for i, doc in enumerate(docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

Number of vectors in the store: 173
Document 1:
Segment Operating Performance
The following table shows net sales by reportable segment for 2023, 2022 and 2021 (dollars in millions): ## Americas
Americas net sales decreased 4% or $7.1 billion during 2023 compared to 2022 due to lower net sales of iPhone and Mac, partially offset by higher net sales of Services. ## Europe
Europe net sales decreased 1% or $824 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Europe net sales, which consisted primarily of lower net sales of Mac and Wearables, Home and Accessories, partially offset by higher net sales of iPhone and Services. ## Greater China
Greater China net sales decreased 2% or $1.6 billion during 2023 compared to 2022. The weakness in the renminbi relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Greater China net sales, which consist

In [19]:
import os
import re
import numpy as np
from typing import List, Annotated
from sklearn.metrics.pairwise import cosine_similarity


def convert_to_markdown(sections, level_to_markdown):
    """Render tree nodes and all of their descendants as Markdown text.

    ``TopSectionTitle`` and ``TitleElement`` nodes become heading lines whose
    prefix is looked up in ``level_to_markdown`` by the element's ``level``
    (``#`` when the level is not in the mapping). ``TextElement`` nodes become
    plain lines. Any other element type contributes nothing.

    Args:
        sections: iterable of nodes exposing ``semantic_element`` and
            ``get_descendants()``.
        level_to_markdown: dict mapping a title level to its heading prefix.

    Returns:
        str: the rendered lines, each terminated by a newline.
    """

    def render(node):
        # One Markdown line for a title or text node; "" for anything else.
        element = node.semantic_element
        if isinstance(element, (TopSectionTitle, TitleElement)):
            heading_prefix = level_to_markdown.get(element.level, '#')
            return f"{heading_prefix} {element.text}\n"
        if isinstance(element, TextElement):
            return f"{element.text}\n"
        return ""

    pieces = []
    for section in sections:
        # The section itself first, then its descendants in the order
        # get_descendants() yields them.
        pieces.append(render(section))
        pieces.extend(render(child) for child in section.get_descendants())
    return "".join(pieces)

def combine_sentences(sentences, buffer_size=1):
    """Attach to every sentence dict a context window built from its neighbours.

    For the dict at position ``i``, the ``"sentence"`` values of up to
    ``buffer_size`` preceding entries, the entry itself, and up to
    ``buffer_size`` following entries are joined with single spaces and stored
    under ``"combined_sentence"``.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        buffer_size: number of neighbours to include on each side.

    Returns:
        The same list object; its dicts are updated in place.
    """
    total = len(sentences)
    for position, entry in enumerate(sentences):
        # Clamp the window to the list bounds. The outer min/max keep a
        # non-positive buffer_size from producing a reversed or negative slice,
        # so the window always contains at least the current sentence.
        first = min(position, max(0, position - buffer_size))
        last = max(position + 1, min(total, position + 1 + buffer_size))
        window = sentences[first:last]
        entry["combined_sentence"] = " ".join(item["sentence"] for item in window)
    return sentences

def calculate_chunk_sizes(sentences, distances, threshold):
    """Group ``sentences`` into text chunks, cutting where the distance is unusually large.

    Despite its name, this returns the chunk *texts*, not their sizes.

    Args:
        sentences: list of dicts, each holding a ``"sentence"`` string.
        distances: sequence of numbers; a value at position ``i`` that lies
            above the cut-off ends a chunk after ``sentences[i]``.
        threshold: percentile (0-100) handed to ``np.percentile`` to derive the
            cut-off from ``distances``.

    Returns:
        list[str]: the sentences of each chunk joined with single spaces.
    """
    # A document with zero or one sentence has no distances, and np.percentile
    # cannot work on an empty input: return everything as one chunk instead.
    if len(distances) == 0:
        if not sentences:
            return []
        return [" ".join(d["sentence"] for d in sentences)]

    breakpoint_distance_threshold = np.percentile(distances, threshold)
    # Positions whose distance exceeds the cut-off are the breakpoints.
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    start_index = 0
    chunks = []
    for index in indices_above_thresh:
        # The breakpoint sentence closes its chunk.
        group = sentences[start_index : index + 1]
        chunks.append(" ".join(d["sentence"] for d in group))
        start_index = index + 1

    # Whatever follows the last breakpoint forms the final chunk.
    if start_index < len(sentences):
        chunks.append(" ".join(d["sentence"] for d in sentences[start_index:]))
    return chunks

def find_appropriate_threshold(sentences, distances, initial_threshold, ceiling):
    """Search downward from ``initial_threshold`` for a percentile whose chunks fit ``ceiling``.

    The percentile is lowered one point at a time. Chunk size is measured in
    whitespace-separated words. The search stops at the first percentile whose
    largest chunk is within ``ceiling``, or at the lowest non-negative
    percentile reachable in whole steps if none qualifies.

    Args:
        sentences: list of sentence dicts passed on to ``calculate_chunk_sizes``.
        distances: neighbour distances passed on to ``calculate_chunk_sizes``.
        initial_threshold: percentile at which the search starts.
        ceiling: maximum allowed number of words per chunk.

    Returns:
        tuple: ``(threshold, chunks, chunk_sizes)``. ``chunks`` and
        ``chunk_sizes`` always belong to the returned ``threshold``.
    """
    threshold = initial_threshold
    while True:
        chunks = calculate_chunk_sizes(sentences, distances, threshold)
        chunk_sizes = [len(chunk.split()) for chunk in chunks]
        # default=0 covers a document that produced no chunks at all.
        if max(chunk_sizes, default=0) <= ceiling:
            break
        # Stop before the percentile would drop below zero, keeping the
        # results of the last percentile that was actually evaluated.
        if threshold < 1:
            break
        threshold -= 1
    return threshold, chunks, chunk_sizes

def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence's embedding and the next one's.

    All neighbour pairs are handled in one vectorised pass instead of one
    library call per pair.

    Args:
        sentences: list of dicts, each holding a numeric vector under
            ``"combined_sentence_embedding"``; all vectors must have the same
            length.

    Returns:
        tuple: ``(distances, sentences)``. ``distances`` has one entry fewer
        than ``sentences`` (empty when there are fewer than two sentences).
        Each dict except the last also receives its value under
        ``"distance_to_next"``; the dicts are updated in place.
    """
    # No neighbour pairs exist for zero or one sentence.
    if len(sentences) < 2:
        return [], sentences

    matrix = np.asarray(
        [entry["combined_sentence_embedding"] for entry in sentences], dtype=float
    )

    # Normalise each row to unit length; an all-zero row is left as zeros
    # (similarity 0, distance 1) instead of dividing by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = matrix / norms

    # Row-wise dot product of every embedding with its successor.
    similarities = np.einsum("ij,ij->i", unit[:-1], unit[1:])
    distances = list(1 - similarities)

    # zip stops at the shorter list, so the last sentence gets no entry.
    for entry, distance in zip(sentences, distances):
        entry["distance_to_next"] = distance

    return distances, sentences

def process_financial_documents(file_paths: Annotated[str, "Comma-separated list of paths to the financial document files."]) -> FAISS:
    """
    Processes multiple financial documents, chunks them, and stores the chunks in a single FAISS vector store.

    Each file is parsed with Edgar10QParser, rendered to Markdown, split into
    sentences and grouped into chunks at the largest distances between
    neighbouring sentence windows. The breakpoint percentile search starts at
    95 and accepts the first percentile whose chunks are all at most 2000 words.

    Args:
        file_paths (str): Comma-separated list of paths to the financial
            document files. A list or tuple of paths is accepted as well.
            Surrounding whitespace is stripped and blank entries (for example
            from a trailing comma) are ignored.

    Returns:
        FAISS: The FAISS vector store containing the document chunks. Every
        chunk carries ``document_id`` (the file's base name) and ``source``
        (the path as given) in its metadata.

    Raises:
        ValueError: If ``file_paths`` contains no usable path.
    """
    # Accept both the declared comma-separated string and a sequence of paths.
    if isinstance(file_paths, str):
        candidates = file_paths.split(',')
    else:
        candidates = [str(path) for path in file_paths]
    paths = [path.strip() for path in candidates if path.strip()]
    if not paths:
        raise ValueError("file_paths must contain at least one path")

    openai_api_key = os.getenv("OPENAI_API_KEY")
    breakpoint_percentile_threshold = 95
    chunk_size_ceiling = 2000
    all_chunks = []
    embeddings_engine = OpenAIEmbeddings(openai_api_key=openai_api_key)

    for file_path in paths:
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        parser = Edgar10QParser()
        elements = parser.parse(html_content)

        tree_builder = TreeBuilder()
        top_level_sections = [item for part in tree_builder.build(elements) for item in part.children]

        # Map each title level found among the top-level sections to a
        # Markdown heading prefix, starting at "##" for the smallest level.
        levels = sorted(
            {k.semantic_element.level for k in top_level_sections if isinstance(k.semantic_element, (TopSectionTitle, TitleElement))}
        )
        level_to_markdown = {level: "#" * (i + 2) for i, level in enumerate(levels)}

        raw_essay = convert_to_markdown(top_level_sections, level_to_markdown)

        # Split at whitespace that follows '.', '#' or ':'.
        single_sentences_list = re.split(r"(?<=[.#:])\s+", raw_essay)
        sentences = [{"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)]

        sentences = combine_sentences(sentences)

        embeddings = embeddings_engine.embed_documents([x["combined_sentence"] for x in sentences])

        for i, sentence in enumerate(sentences):
            sentence["combined_sentence_embedding"] = embeddings[i]

        distances, sentences = calculate_cosine_distances(sentences)

        # Only the chunk texts are needed; the percentile and sizes are discarded.
        _, chunks, _ = find_appropriate_threshold(
            sentences, distances, breakpoint_percentile_threshold, chunk_size_ceiling
        )

        # Filings in this notebook all share one base name
        # ("primary_document.html"), so the path is stored too, letting a
        # chunk be traced back to its file.
        metadata = {"document_id": os.path.basename(file_path), "source": file_path}
        for chunk in chunks:
            all_chunks.append(Document(page_content=chunk, metadata=dict(metadata)))

    vectorstore = FAISS.from_documents(all_chunks, embeddings_engine)
    return vectorstore


In [22]:
# process_financial_documents expects a single comma-separated string of paths
doc_path = ",".join(
    [
        "data/sec-edgar-filings/META/10-k/2023/primary_document.html",
        "data/sec-edgar-filings/META/10-q/2024/Q2/primary_document.html",
        "data/sec-edgar-filings/META/10-q/2023/Q3/primary_document.html",
    ]
)
vs2 = process_financial_documents(doc_path)

# check how many vectors we have in the store
index = vs2.index
print(f"Number of vectors in the store: {index.ntotal}")

# Reconstruct the first 3 stored vectors by position (printing is disabled)
for i in range(min(3, index.ntotal)):
    vector = index.reconstruct(i)
    # print(f"Vector {i}: {vector}")

# Retrieve the k most relevant chunks from the vector store
retriever = vs2.as_retriever(search_kwargs={"k": 3})

query = " How is the Segment Operating Performance of META company in Australia?"

docs = retriever.invoke(query)

# Print the retrieved documents
for i, doc in enumerate(docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Number of vectors in the store: 1373
Document 1:
Corporate Information
We were incorporated in Delaware in July 2004. We completed our initial public offering in May 2012 and our Class A common stock is currently listed on the Nasdaq Global Select Market under the symbol "META." Our principal executive offices are located at 1 Meta Way, Menlo Park, California 94025, and our telephone number is (650) 543-4800.Meta, the Meta logo, Meta Quest, Meta Horizon, Facebook, FB, Instagram, Oculus, WhatsApp, Reels, and our other registered or common law trademarks, service marks, or trade names appearing in this Annual Report on Form 10-K are the property of Meta Platforms, Inc. or its affiliates.

Document 2:
# Opinion on Internal Control over Financial Reporting
We have audited Meta Platforms, Inc.'s internal control over financial reporting as of December 31, 2023, based on criteria established in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the