In [10]:
import boto3
import instructor
import os
from IPython.display import Markdown, display
from pydantic import BaseModel
from bs4 import BeautifulSoup
import re

In [12]:
# AWS service handles shared by the functions below.
s3 = boto3.client(service_name="s3")
bedrock_client = boto3.client(service_name="bedrock-runtime")

# Wrap the Bedrock runtime client with instructor; the wrapped client is the
# one later called with a ``response_model`` to get a typed result back.
client = instructor.from_bedrock(bedrock_client)


# Structured result extracted from a 10-K report. This model is passed as the
# ``response_model`` of the Bedrock call in ``get10kInformations``; the field
# meanings below mirror the prompt sent with that call.
# NOTE(review): documentation is kept in ``#`` comments on purpose -- a class
# docstring would presumably be surfaced in the schema generated from this
# model and so change what is sent to the LLM; confirm before converting.
class Company10k(BaseModel):
    # Short summary of what the company does, its main activities and markets.
    business_resume: str
    # Short explanation of how the company makes money.
    business_model: str
    # Key risks mentioned in the report (business, regulatory, financial, ...).
    risk_factor: list[str]
    # Important physical assets (factories, offices, warehouses, data centers).
    property: list[str]
    # Main industry sectors in which the company operates.
    sector: list[str]
    # More specific activity segments within those sectors.
    sub_sector: list[str]
    # Country or countries where the headquarters are located.
    country_headquarters: list[str]
    # Countries where the main manufacturing or production takes place.
    country_of_production: list[str]
    # Countries where the company operates, sells products or provides services.
    country_of_operation: list[str]
    # Countries where key raw materials or resources are extracted or sourced.
    country_of_ressource: list[str]
    # Main countries or regions where the customers are located.
    client_country: list[str]
    # Types of clients served; the prompt restricts this to 'private companies',
    # 'public companies', 'governments' and 'individual consumers'.
    client_type: list[str]


# Default S3 location of the filing analysed by ``get10kInformations``:
# BUCKET is the bucket name, KEY the object key of the 10-K HTML document.
BUCKET = "csv-file-store-ec51f700"
KEY = (
    "dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4"
    "/dev/data/fillings/AAPL"
    "/2024-11-01-10k-AAPL.html"
)


def extract_relevant_sections(html_text):
    """Return the plain text of the key 10-K items found in an HTML filing.

    The HTML is parsed, ``<script>``, ``<style>`` and ``<table>`` elements are
    removed, and the remaining text is flattened to a single line with
    whitespace runs collapsed to one space. Four sections are then cut out by
    looking for their item headings (case-insensitively):

    * Item 1  -> Item 1A  (Business)
    * Item 1A -> Item 2   (Risk Factors)
    * Item 2  -> Item 3   (Properties)
    * Item 7  -> Item 7A  (MD&A)

    Args:
        html_text: Markup of the 10-K filing.

    Returns:
        The non-empty sections joined by a blank line, stripped of leading and
        trailing whitespace. An empty string if no start heading is found.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Drop scripts, styles and tables; tables are removed to get rid of the
    # table of contents, whose entries would otherwise match the headings.
    for tag in soup(["script", "style", "table"]):
        tag.extract()

    # Flatten to plain text and collapse whitespace runs. For str patterns
    # ``\s`` already matches non-breaking spaces (\xa0), so no separate
    # replacement is needed.
    text = re.sub(r"\s+", " ", soup.get_text(separator="\n"))

    def extract_section(start_marker, end_marker):
        """Return text from the first ``start_marker`` up to ``end_marker``.

        Matching is done on ``text`` itself with ``re.IGNORECASE`` instead of
        on an upper-cased copy: ``str.upper()`` can change the string length
        (e.g. 'ß' -> 'SS'), which would make offsets found in the copy point
        at the wrong place in ``text``.
        """
        start_match = re.search(re.escape(start_marker), text, re.IGNORECASE)
        if start_match is None:
            return ""
        start = start_match.start()
        end_pattern = re.compile(re.escape(end_marker), re.IGNORECASE)
        end_match = end_pattern.search(text, start)
        # Without an end heading the section runs to the end of the document.
        end = end_match.start() if end_match is not None else len(text)
        return text[start:end]

    sections = [
        extract_section("ITEM 1.", "ITEM 1A."),  # Business
        extract_section("ITEM 1A.", "ITEM 2."),  # Risk Factors
        extract_section("ITEM 2.", "ITEM 3."),   # Properties
        extract_section("ITEM 7.", "ITEM 7A."),  # MD&A
    ]

    # Keep only the sections that were actually found.
    combined_text = "\n\n".join(s for s in sections if s.strip())
    return combined_text.strip()


def get10kInformations(bucket: str = BUCKET, key: str = KEY) -> Company10k:
    """Download a 10-K filing from S3 and extract structured company data.

    The HTML object is read from S3, reduced to its relevant sections with
    ``extract_relevant_sections`` and sent to the Bedrock model together with
    the extraction instructions; the answer is parsed into ``Company10k``.

    Args:
        bucket: Name of the S3 bucket holding the filing.
        key: Object key of the 10-K HTML document inside ``bucket``.

    Returns:
        The ``Company10k`` instance produced by the model call.

    Raises:
        UnicodeDecodeError: If the S3 object is not valid UTF-8.
        Errors raised by the S3 read or the model call are not caught here.
    """
    obj = s3.get_object(Bucket=bucket, Key=key)
    text_10K = obj["Body"].read().decode("utf-8")

    text_to_analyze = extract_relevant_sections(text_10K)

    response = client.chat.completions.create(
        modelId="global.anthropic.claude-haiku-4-5-20251001-v1:0",
        messages=[
            {
                "role": "user",
                "content": (
                    "You are an expert financial and regulatory analyst specialized in SEC filings (10-K reports).\n\n"
                    "Extract the following information from the company report below, following this exact schema:\n\n"
                    "1. **business_resume** – A detailed summary (2 sentences) of what the company does, its main activities, and markets.\n"
                    "2. **business_model** – A clear explanation (2 sentences) of how the company makes money (main sources of revenue or services provided).\n"
                    "3. **risk_factor** – A list of key risks (business, regulatory, financial, environmental, or geopolitical) mentioned in the report.\n"
                    "4. **property** – List of important physical assets (factories, offices, warehouses, data centers, etc.).\n"
                    "5. **sector** – Main industry sectors in which the company operates (e.g., Technology, Energy, Finance, Healthcare, etc.).\n"
                    "6. **sub_sector** – More specific activity segments (e.g., Semiconductor Manufacturing, Cloud Services, Retail Banking, etc.).\n"
                    "7. **country_headquarters** – Country or countries where the company’s headquarters are located.\n"
                    "8. **country_of_production** – Countries where the main manufacturing or production takes place.\n"
                    "9. **country_of_operation** – Countries where the company operates, sells products, or provides services.\n"
                    "10. **country_of_ressource** – Countries where the company extracts or sources key raw materials or resources.\n"
                    "11. **client_country** – Main countries or regions where the company’s clients or customers are located.\n"
                    "12. **client_type** – Types of clients the company serves (choose from: 'private companies', 'public companies', 'governments', 'individual consumers').\n\n"
                    f"{text_to_analyze}"
                ),
            },
        ],
        response_model=Company10k,
        inferenceConfig={
            "maxTokens": 64000,
        },
    )

    # Kept for notebook use: render the result inline as before.
    display(response)

    # The function is annotated to return the model; previously it fell off
    # the end and callers received None.
    return response


if __name__ == "__main__":
    # Script entry point: analyse the default filing and print the result.
    Company_10k_info = get10kInformations(bucket=BUCKET, key=KEY)
    print(Company_10k_info)

Company10k(business_resume='Apple Inc. designs, manufactures, and markets a comprehensive range of consumer electronics including smartphones (iPhone), personal computers (Mac), tablets (iPad), wearables (Apple Watch, AirPods), and accessories. The company also provides a variety of related services including AppleCare support, cloud services, digital content platforms (App Store, Apple Music, Apple TV+), advertising services, and payment services (Apple Pay, Apple Card). Apple operates globally with significant international presence, serving consumer, business, education, enterprise, and government markets through both direct channels (retail stores, online) and indirect distribution partners.', business_model='Apple generates revenue through two primary segments: Products (hardware sales including iPhones, Macs, iPads, wearables, and accessories) and Services (recurring revenue from software, digital content, support services, and financial services). The company maintains high gros