In [9]:
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
import instructor
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from pydantic import BaseModel

In [10]:
# AWS clients created once at import time and shared by the functions below.
bedrock_client = boto3.client("bedrock-runtime")
s3 = boto3.client("s3")

# instructor wrapper around the Bedrock runtime client; it is the object the
# extraction call uses together with a pydantic `response_model`.
client = instructor.from_bedrock(bedrock_client)


# Structured summary of a single 10-K filing. It is passed as `response_model`
# to the model call and serialised with `model_dump_json` before being stored.
# Field meanings below mirror the wording of the extraction prompt.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic presumably exposes a class docstring as the JSON-schema
# description, which would change what is sent to the model; confirm before
# converting these comments into a docstring.
class Company10k(BaseModel):
    business_resume: str  # short summary of what the company does, its activities and markets
    business_model: str  # how the company makes money
    risk_factor: list[str]  # key risks mentioned in the report
    property: list[str]  # important physical assets (factories, offices, data centers, ...)
    sector: list[str]  # main industry sectors
    sub_sector: list[str]  # more specific activity segments
    country_headquarters: list[str]  # where the headquarters are located
    country_of_production: list[str]  # where the main manufacturing/production takes place
    country_of_operation: list[str]  # where the company operates, sells or provides services
    country_of_ressource: list[str]  # where key raw materials or resources are sourced
    client_country: list[str]  # main countries or regions of the customers
    client_type: list[str]  # customer categories; the prompt lists four allowed values
    t_conformite: int  # estimated months to reach compliance; the prompt asks for 1-36, not enforced here


# S3 bucket used both for reading the raw filings and for writing results.
BUCKET = "csv-file-store-ec51f700"
# Key prefix that is listed to discover the input `.html` filings.
BASE_PREFIX = "dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillings/"
# Key prefix under which per-file error logs are written (`<OUTPUT_PREFIX>/errors/...`).
# NOTE(review): summary keys are derived in process_single_filling by replacing
# "/fillings/" with "/fillingsResume/" in the input key, not from this constant,
# so the two must be kept in sync by hand.
OUTPUT_PREFIX = "dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume"


def extract_relevant_sections(html_text):
    """Return the narrative sections of a 10-K filing as plain text.

    The HTML is stripped of ``<script>``, ``<style>`` and ``<table>`` elements,
    flattened to text with all whitespace runs collapsed to single spaces, and
    then sliced between "ITEM n." headings.

    Args:
        html_text: Raw HTML of the filing.

    Returns:
        The non-empty sections for Items 1, 1A, 2, 7 and 10 joined by blank
        lines, or an empty string when none of the headings is found.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Drop elements whose content is not narrative text.
    for tag in soup(["script", "style", "table"]):
        tag.extract()

    # Collapse every whitespace run to one space. On str patterns `\s` already
    # matches the non-breaking space (U+00A0), so no separate replacement of
    # that character is needed afterwards.
    text = re.sub(r"\s+", " ", soup.get_text(separator="\n"))

    def extract_section(start_marker, end_marker):
        """Slice `text` from the first `start_marker` up to the next `end_marker`."""
        # Search the original text case-insensitively instead of looking up
        # offsets in `text.upper()`: upper-casing can change the string length
        # (e.g. "ß" -> "SS"), which would shift every offset used for slicing.
        start_match = re.search(re.escape(start_marker), text, re.IGNORECASE)
        if start_match is None:
            return ""
        start = start_match.start()
        end_match = re.compile(re.escape(end_marker), re.IGNORECASE).search(text, start)
        # Without a closing heading the section runs to the end of the document.
        end = end_match.start() if end_match else len(text)
        return text[start:end]

    # NOTE(review): only the first occurrence of each heading is used, which
    # may be a table-of-contents entry or a cross-reference rather than the
    # section itself -- behaviour kept as-is, worth confirming on real filings.
    sections = [
        extract_section("ITEM 1.", "ITEM 1A."),   # Business
        extract_section("ITEM 1A.", "ITEM 2."),   # Risk factors
        extract_section("ITEM 2.", "ITEM 3."),    # Properties
        extract_section("ITEM 7.", "ITEM 7A."),   # MD&A
        extract_section("ITEM 10.", "ITEM 11."),  # Governance
    ]

    return "\n\n".join(s for s in sections if s.strip()).strip()


def _build_10k_prompt(report_text: str) -> str:
    """Assemble the extraction prompt for one filing.

    Every segment ends with an explicit newline so that adjacent literals do
    not fuse sentences together, and no source-code indentation leaks into the
    text sent to the model.
    """
    return (
        "You are an expert financial and regulatory analyst specialized in SEC filings (10-K reports).\n\n"
        "Extract the following information from the company report below, following this exact schema:\n\n"
        "1. **business_resume** – A detailed summary (2 sentences) of what the company does, its main activities, and markets.\n"
        "2. **business_model** – A clear explanation (2 sentences) of how the company makes money (main sources of revenue or services provided).\n"
        "3. **risk_factor** – A list of key risks (3 sentences) (business, regulatory, financial, environmental, or geopolitical) mentioned in the report.\n"
        "4. **property** – List of important physical assets (factories, offices, warehouses, data centers, etc.).\n"
        "5. **sector** – Main industry sectors in which the company operates (e.g., Technology, Energy, Finance, Healthcare, etc.).\n"
        "6. **sub_sector** – More specific activity segments (e.g., Semiconductor Manufacturing, Cloud Services, Retail Banking, etc.).\n"
        "7. **country_headquarters** – Country or countries where the company’s headquarters are located.\n"
        "8. **country_of_production** – Countries where the main manufacturing or production takes place.\n"
        "9. **country_of_operation** – Countries where the company operates, sells products, or provides services.\n"
        "10. **country_of_ressource** – Countries where the company extracts or sources key raw materials or resources.\n"
        "11. **client_country** – Main countries or regions where the company’s clients or customers are located.\n"
        "12. **client_type** – Types of clients the company serves (choose from: 'private companies', 'public companies', 'governments', 'individual consumers').\n\n"
        "13. **t_conformite** – estime la **durée estimée en mois (t_conformité)** nécessaire pour que cette entreprise atteigne la conformité avec la loi.\n"
        "Evaluer 10 facteurs :\n"
        "(0 = contrainte faible → conformité rapide, 1 = contrainte forte → conformité lente)\n\n"
        "CT = Complexité technique (R&D, IT, adaptation produit)\n"
        "CAP = CapEx / financement requis\n"
        "DEP = Dépendances externes (supply chain, licences, partenaires)\n"
        "CON = Contrats existants limitant les changements\n"
        "RES = Ressources internes (RH, ingénieurs, management)\n"
        "CYC = Cycle industriel ou de production\n"
        "PROC = Procédures / autorisations administratives\n"
        "MAT = Maturité digitale (automatisation, ERP, data)\n"
        "GOV = Gouvernance / rapidité de décision\n"
        "TAILLE = Taille et dispersion géographique\n\n"
        "Pondération :\n"
        "CT 0.20, CAP 0.15, DEP 0.15, CON 0.10, RES 0.10, CYC 0.10, PROC 0.10, MAT 0.05, GOV 0.03, TAILLE 0.02\n\n"
        "Calcule le score global :\n"
        "S = somme(w_i × facteur_i)\n\n"
        "Puis estime :\n"
        "`t_conformité = round(1 + S × (36 − 1))`\n\n"
        "(min = 1 mois, max = 36 mois)\n\n"
        "Extract the information from the text below: \n\n"
        f"{report_text}"
    )


def get10kInformations(bucket: str, key: str) -> Company10k:
    """Download one 10-K filing from S3 and extract a structured summary.

    Args:
        bucket: Name of the S3 bucket holding the filing.
        key: Object key of the filing's HTML file.

    Returns:
        The `Company10k` instance produced by the model call.

    Raises:
        ValueError: If none of the expected "ITEM n." sections could be
            extracted, so there is no report text to analyse.
    """
    obj = s3.get_object(Bucket=bucket, Key=key)
    # Filings are not guaranteed to be clean UTF-8; substitute undecodable
    # bytes instead of failing the whole file on a single stray character.
    text_10K = obj["Body"].read().decode("utf-8", errors="replace")

    text_to_analyze = extract_relevant_sections(text_10K)
    if not text_to_analyze:
        # Sending an empty report would only make the model invent a summary.
        raise ValueError(f"No 10-K sections could be extracted from {key}")

    return client.chat.completions.create(
        modelId="global.anthropic.claude-haiku-4-5-20251001-v1:0",
        messages=[
            {
                "role": "user",
                "content": _build_10k_prompt(text_to_analyze),
            },
        ],
        response_model=Company10k,
        inferenceConfig={
            "maxTokens": 64000,
        },
    )

def _summary_is_complete(output_key: str) -> bool:
    """Return True when a stored summary at *output_key* already has ``t_conformite``.

    A failed S3 request (for example a missing key) or a body that is not
    valid UTF-8 JSON is treated as "not complete" instead of raising.
    """
    try:
        obj = s3.get_object(Bucket=BUCKET, Key=output_key)
        stored = json.loads(obj["Body"].read().decode("utf-8"))
    except (s3.exceptions.ClientError, ValueError):
        # ValueError covers both UnicodeDecodeError and json.JSONDecodeError.
        return False
    return isinstance(stored, dict) and stored.get("t_conformite") is not None


def process_single_filling(key: str, *, skip_existing: bool = False) -> None:
    """Summarise one filing and store the result as JSON next to the inputs.

    The output key is the input key with "/fillings/" replaced by
    "/fillingsResume/" and ".html" replaced by ".json". Any failure is printed
    and recorded as a text object under ``<OUTPUT_PREFIX>/errors/``.

    Args:
        key: S3 key of the filing's HTML file inside `BUCKET`.
        skip_existing: When True, do nothing if a stored summary already
            contains ``t_conformite``. Defaults to False, which keeps the
            previous behaviour of always regenerating and overwriting (the
            former existence check never took effect: it referenced an
            undefined name and its failure was silently swallowed).
    """
    output_key = key.replace("/fillings/", "/fillingsResume/").replace(".html", ".json")

    try:
        if skip_existing and _summary_is_complete(output_key):
            print(f"📄 Already exists: {output_key}")
            return

        company_data = get10kInformations(BUCKET, key)
        json_data = company_data.model_dump_json(indent=2)

        s3.put_object(
            Bucket=BUCKET,
            Key=output_key,
            Body=json_data.encode("utf-8"),
            ContentType="application/json",
        )

        print(f"✅ Overwritten: {output_key}")

    except Exception as e:
        # Top-level boundary for one file: report the failure and persist it
        # so that one bad filing does not hide among the batch output.
        print(f"❌ Error processing {key}: {e}")
        s3.put_object(
            Bucket=BUCKET,
            Key=f"{OUTPUT_PREFIX}/errors/{os.path.basename(key)}.log",
            Body=str(e).encode("utf-8"),
            ContentType="text/plain",
        )



def process_all_fillings(max_workers: int = 5) -> None:
    """Summarise every ``.html`` filing found under `BASE_PREFIX` in `BUCKET`.

    Keys are collected from the paginated object listing, then handed to
    `process_single_filling` on a thread pool.

    Args:
        max_workers: Number of worker threads used to process files in parallel.
    """
    paginator = s3.get_paginator("list_objects_v2")
    all_keys = [
        obj["Key"]
        for page in paginator.paginate(Bucket=BUCKET, Prefix=BASE_PREFIX)
        for obj in page.get("Contents", [])  # pages without objects have no "Contents"
        if obj["Key"].endswith(".html")
    ]

    print(f"📄 Found {len(all_keys)} files to process.")

    # Thread pool so that several files are processed at the same time.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which key each future belongs to so failures can name it.
        futures = {executor.submit(process_single_filling, key): key for key in all_keys}

        for future in as_completed(futures):
            # Surface any exception that escaped the worker.
            try:
                future.result()
            except Exception as e:
                print(f"⚠️ Thread error for {futures[future]}: {e}")

if __name__ == "__main__":
    # Run the whole batch with five worker threads.
    process_all_fillings(max_workers=5)

📄 Found 500 files to process.
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/AAPL/2024-11-01-10k-AAPL.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ABT/2025-02-21-10k-ABT.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ABBV/2025-02-14-10k-ABBV.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/A/2024-12-20-10k-A.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ABNB/2025-02-13-10k-ABNB.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ADBE/2025-01-13-10k-ADBE.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ACN/2024-10-10-10k-ACN.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ACGL/2025-02-27-10k-ACGL.json
✅ Overwritten: dzd-3lz7fcr1rwmmkw/5h6d6xccl72dn4/dev/data/fillingsResume/ADI/2024-11-26-10k-ADI.json
‚úÖ Overwritten: dzd-3lz7fcr1rwmmk