In [1]:
import chromadb
import dotenv

def init_client():
    """Open the persistent ChromaDB client configured in ``../.env``.

    The storage directory is read from the ``path`` key of ``../.env``.

    Returns:
        The ``chromadb.PersistentClient`` on success. On any failure (missing
        or empty ``path`` value, or any exception raised while reading the
        file or building the client) the error is printed and ``None`` is
        returned implicitly.
    """
    try:
        storage_dir = dotenv.get_key("../.env", "path")
        if storage_dir:
            return chromadb.PersistentClient(path=storage_dir)
        # A missing/empty path goes through the same handler as every other failure.
        raise Exception("No path found")
    except Exception as err:
        print("Error while initialising client : ", err)
import chromadb
from typing import List, Dict


In [4]:

def get_section_chunks(section : str, company : str, years : List[str|int]):
    try:
        result = {}
        client = init_client()
        collection = client.get_collection("labeled_chunks")
        for year in years:
            year = str(year)
            print(f"finding for {company}_{year} under the section : {section}")
            # query_res = collection.get(
            #     where={"company":company.lower()},
            #     include=["metadatas","documents"]
            # )
            get_res = client.get_collection("labeled_chunks").get(
                                                     where={"$and":[{"company":company},{"year":year},{"section":section}]},
                                                     include=['documents']
                                                    )
            result[(int)(year)] = get_res["documents"]
            # print(get_res)
            print(f"found {len(result[(int)(year)])} chunks for {company}_{year} under the section : {section}")
        return result
    except Exception as e:
        print("no client or collection")
        print(e)
        return {}

finding for apple_2024 under the section : risk_factors
found 258 chunks for apple_2024 under the section : risk_factors
finding for apple_2023 under the section : risk_factors
found 264 chunks for apple_2023 under the section : risk_factors
finding for apple_2222 under the section : risk_factors
found 0 chunks for apple_2222 under the section : risk_factors


In [71]:
from google import genai
from google.genai import types
import dotenv
import os

def get_client():
    """Build a Gemini API client from the ``gemini_api_key`` environment variable.

    ``dotenv.load_dotenv()`` is called first so that a key kept in a ``.env``
    file is available through the environment as well.

    Returns:
        A ``genai.Client`` constructed with that key.

    The key itself is deliberately never printed: notebook outputs are saved
    with the file, so echoing it would leak the credential to every reader.
    """
    dotenv.load_dotenv()
    api_key = os.getenv("gemini_api_key")
    if not api_key:
        # Report the problem without exposing any secret material.
        print("gemini_api_key is not set in the environment")
    return genai.Client(api_key=api_key)

def _batch_chunks(chunk_texts, char_limit):
    """Greedily pack chunk texts into batches of roughly ``char_limit`` characters.

    Chunks inside a batch are joined with a blank line. Empty chunks are
    skipped, no batch starts with a separator, and no empty batch is ever
    produced (so an empty input gives an empty list). A single chunk longer
    than ``char_limit`` becomes a batch of its own.
    """
    batches = []
    current = ""
    for text in chunk_texts:
        if not text:
            continue
        if not current:
            current = text
        elif len(current) + len(text) > char_limit:
            batches.append(current)
            current = text
        else:
            current += "\n\n" + text
    if current:
        batches.append(current)
    return batches


def summarise_section(chunks:Dict[int,List[str]],company:str,years:List[int],section):
    """Summarise one report section per year with a map-reduce over its chunks.

    For every year the chunks are packed into batches, each batch is
    summarised on its own ("map"), and the batch summaries are then condensed
    into one final summary ("reduce").

    Args:
        chunks: Mapping of year to the list of chunk texts for that year.
        company: Company name; used in the progress message and the prompt.
        years: Years to summarise; each must be a key of ``chunks``.
        section: Section name the summaries should focus on.

    Returns:
        ``{year: summary text}``. A year with no usable text gets a fixed
        "insufficient data" summary without calling the model. If any step
        raises, the error is printed and ``None`` is returned.
    """
    try:
        client = get_client()
        summaries = {}
        SAFE_CHAR_LIMIT = 300_000 // 2
        # NOTE(review): this output-token budget (and the 100_000 used for the
        # final call) looks larger than the model's output cap — confirm.
        SAFE_TOKEN_LIMIT = int(1_000_000 / 1.5)
        NO_DATA_SUMMARY = "The data provided is insufficient."

        for year in years:
            batches = _batch_chunks(chunks[year] or [], SAFE_CHAR_LIMIT)
            print(f"Mapping {len(batches)} batch(es) for {company} {year}...")

            intermediate_summaries = []
            for batch_text in batches:
                map_prompt = f"Summarize the key points in the following text excerpt focus mainly on the topic {section} (this process is under a batching mechanism which will be used lastly for analysis, so try to not miss out on any information): \n\n{batch_text}"
                map_response = client.models.generate_content(
                    model="gemini-2.0-flash",
                    contents=map_prompt,
                    config=types.GenerateContentConfig(max_output_tokens=SAFE_TOKEN_LIMIT),
                )
                # A response can carry no text; joining None would raise.
                if map_response.text:
                    intermediate_summaries.append(map_response.text)

            combined_summary_text = "\n".join(intermediate_summaries)
            if not combined_summary_text:
                # Nothing to condense: do not ask the model to summarise an
                # empty input, which invites invented content.
                summaries[year] = NO_DATA_SUMMARY
                continue

            system_prompt = f"You are a SENIOR FINANCIAL ANALYST. Based on the following text ONLY under '{section.capitalize()}' disclosed by {company.capitalize()} in their {year} annual report. You are supposed to create an EXTENSIVE but CONCISE summary for it. If the RELEVANT content is not in text, say the data provided is insufficient, and move on. DON'T make ASSUMPTIONS. Provide it as a formal output, don't tell I am AI, or I will do this and this, just come to the main point instantly, with a concise heading"
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=combined_summary_text,
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                    max_output_tokens=100_000
                )
            )
            # Keep every value a string, mirroring compare_sections' fallback.
            summaries[year] = response.text or "no response"
        return summaries
    except Exception as e:
        print("error while summarising")
        print(e)
        return None
        



finding for apple_2024 under the section : financial_statements
found 379 chunks for apple_2024 under the section : financial_statements
finding for apple_2023 under the section : financial_statements
found 357 chunks for apple_2023 under the section : financial_statements
[REDACTED: API key removed from saved output; rotate this key]
Mapping 2 batch(es) for Apple 2023...
Mapping 3 batch(es) for Apple 2024...


In [46]:
def compare_sections(summaries_by_year: Dict[int, str], company: str, section: str, years: List[int|str]) -> str:
    assert len(years) >= 2, f"Less than 2 years can't be compared"
    years.sort()

    try:

        year_1 = int(years[0])
        year_2 = int(years[-1])
        eff_years = [year_1,year_2]

        # summaries = {
        #     year_1 : summaries_by_year[year_1],
        #     year_2 : summaries_by_year[year_2]
        # }
        # summaries = str(summaries_by_year)
        comparison_text = f"""
        Summary for {year_1}:
        ---
        {summaries_by_year[year_1]}
        ---

        Summary for {year_2}:
        ---
        {summaries_by_year[year_2]}
        ---
        """


        client = get_client()
        system_prompt = f"""
        You are a Senior Financial Analsyst. 
        You will be provided with some information/data for the company {company},
        under the section {section},
        for the years : {years}.
        You are required to compare and analyse them and generate a bullet-point comparison. 
        Explicitly label changes as 'ADDED', 'REMOVED', or 'MODIFIED'. Focus only on significant differences.
        Keep the result extensive, but concise, to the point.
        Don't assume any other unobvious data, only do what ever is possible from the provided data.
        If data is insufficient, you may tell so, but in no case make you own data.
        Provide formal output, don't tell I am AI, or I will do this and this, just come to the main point instantly, with a concise heading.
        """

        response = client.models.generate_content(
            model = "gemini-2.0-flash",
            config=types.GenerateContentConfig(system_instruction=system_prompt),
            contents = comparison_text
        )
        return response.text or "no response"
    except Exception as e:
        print('error while comparing')
        print(e)
    

    





In [72]:
# Pull the stored "general_body" chunks for Apple's 2024 and 2023 filings.
section = "general_body"
res = get_section_chunks(
    section=section,
    company="apple",
    years=[2024, 2023],
)

finding for apple_2024 under the section : general_body
found 371 chunks for apple_2024 under the section : general_body
finding for apple_2023 under the section : general_body
found 160 chunks for apple_2023 under the section : general_body


In [73]:

# Summarise each year's chunks, then compare the two yearly summaries.
summ = summarise_section(
    chunks=res,
    company="Apple",
    section=section,
    years=[2023, 2024],
)
result = compare_sections(
    summaries_by_year=summ,
    company="apple",
    section=section,
    years=[2023, 2024],
)

[REDACTED: API key removed from saved output; rotate this key]
Mapping 1 batch(es) for Apple 2023...
Mapping 3 batch(es) for Apple 2024...
[REDACTED: API key removed from saved output; rotate this key]


In [74]:
# Show the comparison text returned by compare_sections (None if that call failed).
print(result)

## Apple Inc. Financial Analysis: 2023 vs. 2024

Here's a comparative analysis of Apple's financial data based on the provided summaries for 2023 and 2024:

**General Company Information:**

*   No significant change in company overview; Apple continues to design, manufacture, and market similar product lines and services.
*   No change in principal executive offices, located in Cupertino, CA.
*   No change in fiscal year end, which remains the last Saturday of September.

**Securities and Filing Status:**

*   No change in registered securities, with common stock (AAPL) trading on NASDAQ.
*   No change in filer status; Apple remains a large accelerated filer.

**Products:**

*   **MODIFIED:** Product lines include iPhone 16 Pro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE in 2024 compared to just iPhone in 2023, indicating new product iterations.
*   **MODIFIED:** Product lines include MacBook Air, MacBook Pro, iMac, Mac Mini, Mac Studio, Mac Pro in 2024 compared to just Mac in 2023