In [2]:
import chromadb
import dotenv

def init_client():
    """Create a persistent Chroma client from the path stored in ``../.env``.

    The storage location is read from the ``path`` key of ``../.env``.

    Returns:
        The ``chromadb.PersistentClient`` on success, or ``None`` when the
        path is missing/empty or client creation raises (the error is printed
        rather than propagated).
    """
    try:
        db_path = dotenv.get_key("../.env", "path")
        if db_path:
            return chromadb.PersistentClient(path=db_path)
        # A missing or empty value is routed through the same handler below.
        raise Exception("No path found")
    except Exception as err:
        print("Error while initialising client : ", err)
    return None
import chromadb
from typing import List, Dict


In [3]:

def get_section_chunks(section : str, company : str, years : List[str]):
    """Fetch the stored document chunks of one report section, grouped by year.

    For every entry in ``years`` the ``labeled_chunks`` collection is filtered
    on the ``company``, ``year`` and ``section`` metadata fields (exact match,
    values passed through unchanged).

    Args:
        section: Section label to match against the ``section`` metadata field.
        company: Company name to match against the ``company`` metadata field.
        years: Years as strings; each is matched as given against the ``year``
            metadata field and converted to ``int`` for the result key.

    Returns:
        ``{int(year): [document, ...]}`` for every requested year, or ``{}``
        if the client/collection is unavailable or any lookup fails.
    """
    try:
        client = init_client()
        if client is None:
            # init_client() already printed the cause and returned None.
            print("no client or collection")
            return {}

        # Resolve the collection once instead of once per requested year.
        collection = client.get_collection("labeled_chunks")

        result = {}
        for year in years:
            print(f"finding for {company}_{year} under the section : {section}")
            get_res = collection.get(
                where={"$and":[{"company":company},{"year":year},{"section":section}]},
                include=['documents']
            )
            documents = get_res["documents"]
            result[int(year)] = documents
            print(f"found {len(documents)} chunks for {company}_{year} under the section : {section}")
        return result
    except Exception as e:
        print("no client or collection")
        print(e)
        return {}

In [103]:
# Pull the "risk_factors" chunks stored for apple for 2024 and 2023.
res = get_section_chunks(
    section="risk_factors",
    company="apple",
    years=["2024", "2023"],
)

finding for apple_2024 under the section : risk_factors
found 258 chunks for apple_2024 under the section : risk_factors
finding for apple_2023 under the section : risk_factors
found 264 chunks for apple_2023 under the section : risk_factors


In [122]:
from google import genai
from google.genai import types
import dotenv
import os

def get_client():
    """Build a Gemini client using the ``gemini_api_key`` environment variable.

    ``dotenv.load_dotenv()`` is called first so a ``.env`` file can supply the
    variable. The key is deliberately never printed or logged: notebook output
    is saved with the file and would leak the credential.

    Returns:
        A ``genai.Client`` constructed with the configured API key.
    """
    dotenv.load_dotenv()
    api_key = os.getenv("gemini_api_key")
    return genai.Client(api_key=api_key)

def _batch_chunks(texts, char_limit, separator="\n\n"):
    """Greedily pack ``texts`` into separator-joined batches of at most ``char_limit`` characters.

    Empty texts are skipped and no empty batch is ever produced. A single text
    longer than ``char_limit`` is not split; it becomes a batch of its own.
    """
    batches = []
    current = ""
    for text in texts:
        if not text:
            continue
        if not current:
            current = text
        elif len(current) + len(separator) + len(text) > char_limit:
            batches.append(current)
            current = text
        else:
            current += separator + text
    if current:
        batches.append(current)
    return batches


def summarise_section(chunks:Dict[int,List[str]],company:str,years:List[int],section):
    """Summarise one report section per year with a map-then-combine pass.

    For each year the chunk texts are packed into batches, every batch is
    summarised by ``gemini-2.0-flash`` (map step), and the joined partial
    summaries are condensed by ``gemini-2.0-flash-lite`` under an analyst
    system prompt (combine step).

    Args:
        chunks: Mapping of year -> list of chunk texts for that year.
        company: Company name, used in progress output and in the prompt.
        years: Years to summarise; each must be a key of ``chunks``.
        section: Section name the summaries should focus on.

    Returns:
        ``{year: summary_text}`` for all requested years. A year with no text
        to summarise gets a fixed "insufficient data" note without any model
        call. Returns ``None`` if anything raises (the error is printed).
    """
    try:
        client = get_client()
        summaries = {}
        SAFE_CHAR_LIMIT = 300_000/2
        # NOTE(review): this is passed as max_output_tokens below and looks far
        # larger than the model's documented output limit; confirm the API
        # clamps it rather than rejecting it.
        SAFE_TOKEN_LIMIT = 1_000_000/1.5

        for year in years:
            batches = _batch_chunks(chunks[year], SAFE_CHAR_LIMIT)

            intermediate_summaries = []
            print(f"Mapping {len(batches)} batch(es) for {company} {year}...")

            for batch_text in batches:
                map_prompt = f"Summarize the key points in the following text excerpt focus mainly on the topic {section} (this process is under a batching mechanism which will be used lastly for analysis, so try to not miss out on any information): \n\n{batch_text}"
                map_response = client.models.generate_content(model="gemini-2.0-flash",contents=map_prompt,
                                                              config=types.GenerateContentConfig(max_output_tokens=int(SAFE_TOKEN_LIMIT)))
                # Guard against a response without text (presumably possible
                # when a reply is blocked or empty); joining None would raise.
                if map_response.text:
                    intermediate_summaries.append(map_response.text)

            if not intermediate_summaries:
                # Nothing to condense: skip the combine call instead of
                # sending the model an empty document.
                summaries[year] = "The data provided is insufficient."
                continue

            combined_summary_text = "\n".join(intermediate_summaries)
            system_prompt = f"You are a SENIOR FINANCIAL ANALYST. Based on the following text ONLY under '{section.capitalize()}' disclosed by {company.capitalize()} in their {year} annual report. You are supposed to create an EXTENSIVE but CONCISE summary for it. If the RELEVANT content is not in text, say the data provided is insufficient, and move on. DON'T make ASSUMPTIONS. Provide it as a formal output, don't tell I am AI, or I will do this and this, just come to the main point instantly, with a concise heading"
            response = client.models.generate_content(
                model="gemini-2.0-flash-lite",
                contents=combined_summary_text,
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt
                )
            )
            summaries[year] = response.text
        return summaries
    except Exception as e:
        print("error while summarising")
        print(e)
        return None
        



In [123]:
# Summarise the fetched chunks for 2023 and 2024 under "risk factors".
summ = summarise_section(
    chunks=res,
    company="Apple",
    section="risk factors",
    years=[2023, 2024],
)

[API key redacted from saved output; the exposed key should be revoked and rotated]
Mapping 3 batch(es) for Apple 2023...
Mapping 2 batch(es) for Apple 2024...


In [124]:
# Print the 2024 summary so its markdown-style text is shown with real line breaks.
print(summ[2024])

## Apple 2024 Risk Factor Summary

This summary outlines the key risk factors disclosed by Apple in its 2024 annual report, categorized for clarity:

**I. Macroeconomic & Operational Risks:**

*   **Economic Volatility:** Global/regional economic downturns (slow growth, recession, inflation, unemployment, interest rate hikes) negatively impact consumer spending, international operations, and supply chains. Currency fluctuations add further risk.
*   **International Dependencies:** Reliance on international markets (sales, supply chain, manufacturing) heightens exposure to global economic, political, and trade-related disruptions (tariffs, disputes, geopolitical tensions, conflict).
*   **Supply Chain Disruptions:** Concentrated supply chains and reliance on single or limited sources (components, manufacturing) amplify vulnerability to disruptions due to natural disasters, public health crises (pandemics), industrial accidents, cyberattacks, and geopolitical conflicts.
*   **Partner Vul

In [118]:
# Bare expression: displays the repr of the 2023 summary string (newlines appear as \n).
summ[2023]

"Here's a concise summary of the risk factors disclosed by Apple in their 2023 annual report (based on the provided text excerpt):\n\n**Market & Competitive Risks:**\n*   Intense competition leading to margin pressure.\n*   Rapid technological change and short product lifecycles.\n*   Price sensitivity of consumers.\n*   Competitor actions (pricing, imitation).\n*   Minority market share in key markets.\n*   Market contractions increasing competition.\n*   Competition in services from free providers.\n\n**Supply Chain & Manufacturing Risks:**\n*   Reliance on single or limited component sources.\n*   Potential for component shortages and pricing fluctuations.\n*   Dependence on custom components from single sources.\n*   Supplier financial instability.\n*   Manufacturing primarily outsourced, often in specific locations.\n*   Supply chain disruptions from various factors.\n*   Risks associated with outsourcing.\n*   Disruptions to manufacturing, logistics, and transit.\n*   Risks assoc